{"id":"https://openalex.org/W4308092051","doi":"https://doi.org/10.1109/hpec55821.2022.9926330","title":"Towards Fast Crash-Consistent Cluster Checkpointing","display_name":"Towards Fast Crash-Consistent Cluster Checkpointing","publication_year":2022,"publication_date":"2022-09-19","ids":{"openalex":"https://openalex.org/W4308092051","doi":"https://doi.org/10.1109/hpec55821.2022.9926330"},"language":"en","primary_location":{"id":"doi:10.1109/hpec55821.2022.9926330","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec55821.2022.9926330","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038525681","display_name":"Andrew W. Wood","orcid":"https://orcid.org/0000-0002-6231-0085"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Andrew Wood","raw_affiliation_strings":["Boston University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Boston University","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001422326","display_name":"Moshik Hershcovitch","orcid":"https://orcid.org/0000-0002-4826-4174"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Moshik Hershcovitch","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091041118","display_name":"Ilias Ennmouri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ilias Ennmouri","raw_affiliation_strings":["IBM"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074024626","display_name":"Weiyu Zong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weiyu Zong","raw_affiliation_strings":["Boston University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Boston University","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012152867","display_name":"Saurav Chennuri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saurav Chennuri","raw_affiliation_strings":["Boston University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Boston University","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049884582","display_name":"Sarel Cohen","orcid":"https://orcid.org/0000-0003-4578-1245"},"institutions":[{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Sarel Cohen","raw_affiliation_strings":["The Academic College of Tel Aviv-Yaffo &#x0026; Hasso Plattner Institute,Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Academic College of Tel Aviv-Yaffo &#x0026; Hasso Plattner Institute,Germany","institution_ids":["https://openalex.org/I143288331"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103442361","display_name":"Sundararaman Swaminathan","orcid":"https://orcid.org/0000-0003-4773-141X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Swaminathan Sundararaman","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033709518","display_name":"Daniel Waddington","orcid":"https://orcid.org/0000-0001-8758-910X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Waddington","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113696329","display_name":"Peter Chin","orcid":null},"institutions":[{"id":"https://openalex.org/I107672454","display_name":"Dartmouth College","ror":"https://ror.org/049s0rh22","country_code":"US","type":"education","lineage":["https://openalex.org/I107672454"]},{"id":"https://openalex.org/I4210166639","display_name":"Dartmouth Hospital","ror":"https://ror.org/02j3qj605","country_code":"GB","type":"healthcare","lineage":["https://openalex.org/I4210166639"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Peter Chin","raw_affiliation_strings":["Dartmouth College"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Dartmouth College","institution_ids":["https://openalex.org/I4210166639","https://openalex.org/I107672454"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.2852,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.58754279,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8817192316055298},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5911828279495239},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.57829749584198},{"id":"https://openalex.org/keywords/crash","display_name":"Crash","score":0.5608516931533813},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5576004385948181},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.45625242590904236},{"id":"https://openalex.org/keywords/downtime","display_name":"Downtime","score":0.45325711369514465},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.42429202795028687},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4228109121322632},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.4110547602176666},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.19648736715316772}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8817192316055298},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5911828279495239},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.57829749584198},{"id":"https://openalex.org/C183469790","wikidata":"https://www.wikidata.org/wiki/Q333501","display_name":"Crash","level":2,"score":0.5608516931533813},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5576004385948181},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.45625242590904236},{"id":"https://openalex.org/C180591934","wikidata":"https://www.wikidata.org/wiki/Q1253369","display_name":"Downtime","level":2,"score":0.45325711369514465},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.42429202795028687},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4228109121322632},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.4110547602176666},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.19648736715316772}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpec55821.2022.9926330","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec55821.2022.9926330","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2164287667","https://openalex.org/W1967627035","https://openalex.org/W1188123746","https://openalex.org/W1509211761","https://openalex.org/W2391299576","https://openalex.org/W2319467001","https://openalex.org/W2034393996","https://openalex.org/W1869243490","https://openalex.org/W2133693067","https://openalex.org/W1531488649"],"abstract_inverted_index":{"Machine":[0],"Learning":[1],"models":[2,18],"are":[3,19,41],"expensive":[4,9],"to":[5,22,46,71,126,150],"train:":[6],"they":[7],"require":[8],"high-compute":[10],"hardware":[11],"and":[12,95,119,164,171,181,200,204],"have":[13],"long":[14],"training":[15],"times.":[16],"Therefore,":[17],"extra":[20],"sensitive":[21],"program":[23],"faults":[24],"or":[25],"unexpected":[26,51],"system":[27,52],"crashes,":[28,223],"which":[29,112],"can":[30,146],"erase":[31],"hours":[32],"if":[33],"not":[34],"days":[35,151],"worth":[36],"of":[37,43,50,68,98,152,195],"work.":[38],"While":[39],"there":[40],"plenty":[42],"strategies":[44],"designed":[45],"mitigate":[47],"the":[48,54,66,69,96,128,159,169,210],"risk":[49],"downtime,":[53],"most":[55],"popular":[56],"strategy":[57],"in":[58],"machine":[59],"learning":[60],"is":[61,75,90],"called":[62],"checkpointing:":[63],"periodically":[64],"saving":[65],"state":[67],"model":[70],"persistent":[72,144],"storage.":[73],"Checkpointing":[74],"an":[76],"effective":[77],"strategy,":[78],"however,":[79],"it":[80],"requires":[81],"carefully":[82],"balancing":[83],"two":[84,177,189],"operations:":[85],"how":[86],"often":[87],"a":[88,100,192,196],"checkpoint":[89,101],"made":[91],"(the":[92],"checkpointing":[93,129,140,153,160,193,213],"schedule),":[94],"cost":[97],"creating":[99],"itself.":[102],"In":[103],"this":[104],"paper,":[105],"we":[106,185],"leverage":[107],"Python":[108,114],"Memory":[109,118,122],"Manager":[110],"(PyMM),":[111],"provides":[113],"support":[115],"for":[116,202,207],"Persistent":[117,121],"emerging":[120],"technology":[123],"(Optane":[124],"DC)":[125],"accelerate":[127],"operation":[130,161],"while":[131,224],"maintaining":[132],"crash":[133],"consistency.":[134],"We":[135,155,215],"first":[136],"show":[137,186],"that":[138,187,218],"when":[139],"models,":[141],"PyMM":[142,163],"with":[143,162,168],"memory":[145],"save":[147],"from":[148,222],"minutes":[149],"runtime.":[154],"then":[156],"further":[157],"optimize":[158],"demonstrate":[165],"our":[166,219],"approach":[167],"KMeans":[170,203],"Gaussian":[172],"Mixture":[173],"Model":[174],"algorithms":[175,190],"on":[176],"real-world":[178],"datasets,":[179],"MNIST":[180],"MusicNet.":[182],"Through":[183],"evaluation,":[184],"these":[188],"achieve":[191],"speedup":[194],"factor":[197],"between":[198],"10":[199],"75x":[201],"over":[205],"3x":[206],"GMM":[208],"against":[209],"current":[211],"state-of-the-art":[212],"approaches.":[214],"also":[216],"verify":[217],"solution":[220],"recovers":[221],"traditional":[225],"approaches":[226],"cannot.":[227]},"counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
