{"id":"https://openalex.org/W2891046591","doi":"https://doi.org/10.1145/3236367.3236383","title":"Transparent High-Speed Network Checkpoint/Restart in MPI","display_name":"Transparent High-Speed Network Checkpoint/Restart in MPI","publication_year":2018,"publication_date":"2018-09-19","ids":{"openalex":"https://openalex.org/W2891046591","doi":"https://doi.org/10.1145/3236367.3236383","mag":"2891046591"},"language":"en","primary_location":{"id":"doi:10.1145/3236367.3236383","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3236367.3236383","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 25th European MPI Users' Group Meeting","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5107746453","display_name":"Julien Adam","orcid":"https://orcid.org/0009-0005-1919-7020"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Julien Adam","raw_affiliation_strings":["ParaTools SAS, Bruy\u00e8res-le-Ch\u00e2tel, France"],"affiliations":[{"raw_affiliation_string":"ParaTools SAS, Bruy\u00e8res-le-Ch\u00e2tel, France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024459582","display_name":"Jean-Baptiste Besnard","orcid":"https://orcid.org/0000-0001-6500-6786"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jean-Baptiste Besnard","raw_affiliation_strings":["ParaTools SAS, Bruy\u00e8res-le-Ch\u00e2tel, France"],"affiliations":[{"raw_affiliation_string":"ParaTools SAS, Bruy\u00e8res-le-Ch\u00e2tel, France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029524538","display_name":"Allen D. Malony","orcid":"https://orcid.org/0000-0002-9598-7201"},"institutions":[{"id":"https://openalex.org/I4210132301","display_name":"ParaTools (United States)","ror":"https://ror.org/046fp0073","country_code":"US","type":"company","lineage":["https://openalex.org/I4210132301"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Allen D. Malony","raw_affiliation_strings":["ParaTools Inc., Eugene, USA"],"affiliations":[{"raw_affiliation_string":"ParaTools Inc., Eugene, USA","institution_ids":["https://openalex.org/I4210132301"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023240094","display_name":"Sameer Shende","orcid":"https://orcid.org/0000-0002-2592-669X"},"institutions":[{"id":"https://openalex.org/I4210132301","display_name":"ParaTools (United States)","ror":"https://ror.org/046fp0073","country_code":"US","type":"company","lineage":["https://openalex.org/I4210132301"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sameer Shende","raw_affiliation_strings":["ParaTools Inc., Eugene, USA"],"affiliations":[{"raw_affiliation_string":"ParaTools Inc., Eugene, USA","institution_ids":["https://openalex.org/I4210132301"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015641898","display_name":"Marc P\u00e9rache","orcid":"https://orcid.org/0000-0003-1615-2749"},"institutions":[{"id":"https://openalex.org/I4210101455","display_name":"CEA DAM \u00cele-de-France","ror":"https://ror.org/00kn4eb29","country_code":"FR","type":"government","lineage":["https://openalex.org/I4210101455"]},{"id":"https://openalex.org/I2738703131","display_name":"Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives","ror":"https://ror.org/00jjx8s55","country_code":"FR","type":"government","lineage":["https://openalex.org/I2738703131"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Marc P\u00e9rache","raw_affiliation_strings":["CEA, DAM, DIF, Arpajon, France"],"affiliations":[{"raw_affiliation_string":"CEA, DAM, DIF, Arpajon, France","institution_ids":["https://openalex.org/I4210101455","https://openalex.org/I2738703131"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102019685","display_name":"Patrick Carribault","orcid":"https://orcid.org/0009-0003-7210-0449"},"institutions":[{"id":"https://openalex.org/I4210101455","display_name":"CEA DAM \u00cele-de-France","ror":"https://ror.org/00kn4eb29","country_code":"FR","type":"government","lineage":["https://openalex.org/I4210101455"]},{"id":"https://openalex.org/I2738703131","display_name":"Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives","ror":"https://ror.org/00jjx8s55","country_code":"FR","type":"government","lineage":["https://openalex.org/I2738703131"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Patrick Carribault","raw_affiliation_strings":["CEA, DAM, DIF, Arpajon, France"],"affiliations":[{"raw_affiliation_string":"CEA, DAM, DIF, Arpajon, France","institution_ids":["https://openalex.org/I4210101455","https://openalex.org/I2738703131"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078100396","display_name":"Julien Jaeger","orcid":"https://orcid.org/0000-0003-0084-1574"},"institutions":[{"id":"https://openalex.org/I2738703131","display_name":"Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives","ror":"https://ror.org/00jjx8s55","country_code":"FR","type":"government","lineage":["https://openalex.org/I2738703131"]},{"id":"https://openalex.org/I4210101455","display_name":"CEA DAM \u00cele-de-France","ror":"https://ror.org/00kn4eb29","country_code":"FR","type":"government","lineage":["https://openalex.org/I4210101455"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Julien Jaeger","raw_affiliation_strings":["CEA, DAM, DIF, Arpajon, France"],"affiliations":[{"raw_affiliation_string":"CEA, DAM, DIF, Arpajon, France","institution_ids":["https://openalex.org/I4210101455","https://openalex.org/I2738703131"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5107746453"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.7381,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.74849101,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/infiniband","display_name":"InfiniBand","score":0.8945015668869019},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8853777050971985},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.713547945022583},{"id":"https://openalex.org/keywords/message-passing-interface","display_name":"Message Passing Interface","score":0.6698131561279297},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.6068377494812012},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.6012426614761353},{"id":"https://openalex.org/keywords/massively-parallel","display_name":"Massively parallel","score":0.5577105283737183},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5241668224334717},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5081884860992432},{"id":"https://openalex.org/keywords/crash","display_name":"Crash","score":0.48796766996383667},{"id":"https://openalex.org/keywords/message-passing","display_name":"Message passing","score":0.4782789647579193},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.47683730721473694},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.4444955289363861},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.4371996819972992},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09902840852737427}],"concepts":[{"id":"https://openalex.org/C2781030343","wikidata":"https://www.wikidata.org/wiki/Q922437","display_name":"InfiniBand","level":2,"score":0.8945015668869019},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8853777050971985},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.713547945022583},{"id":"https://openalex.org/C166782233","wikidata":"https://www.wikidata.org/wiki/Q127879","display_name":"Message Passing Interface","level":3,"score":0.6698131561279297},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.6068377494812012},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.6012426614761353},{"id":"https://openalex.org/C190475519","wikidata":"https://www.wikidata.org/wiki/Q544384","display_name":"Massively parallel","level":2,"score":0.5577105283737183},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5241668224334717},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5081884860992432},{"id":"https://openalex.org/C183469790","wikidata":"https://www.wikidata.org/wiki/Q333501","display_name":"Crash","level":2,"score":0.48796766996383667},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.4782789647579193},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.47683730721473694},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.4444955289363861},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4371996819972992},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09902840852737427},{"id":"https://openalex.org/C157915830","wikidata":"https://www.wikidata.org/wiki/Q2928001","display_name":"Bubble","level":2,"score":0.0},{"id":"https://openalex.org/C129307140","wikidata":"https://www.wikidata.org/wiki/Q6795880","display_name":"Maximum bubble pressure method","level":3,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3236367.3236383","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3236367.3236383","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 25th European MPI Users' Group Meeting","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.46000000834465027,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W163470805","https://openalex.org/W170768260","https://openalex.org/W185651294","https://openalex.org/W1522250664","https://openalex.org/W1573548168","https://openalex.org/W1576341769","https://openalex.org/W1583784092","https://openalex.org/W1603470798","https://openalex.org/W1915019682","https://openalex.org/W1981432246","https://openalex.org/W1984564341","https://openalex.org/W1986190431","https://openalex.org/W2008323179","https://openalex.org/W2017060126","https://openalex.org/W2021234574","https://openalex.org/W2034881140","https://openalex.org/W2067021722","https://openalex.org/W2079577430","https://openalex.org/W2104486653","https://openalex.org/W2104679543","https://openalex.org/W2116115793","https://openalex.org/W2126163873","https://openalex.org/W2128577831","https://openalex.org/W2139244298","https://openalex.org/W2144108682","https://openalex.org/W2155204206","https://openalex.org/W2521708680","https://openalex.org/W2534641909","https://openalex.org/W2535102629","https://openalex.org/W4206638755","https://openalex.org/W4239427345","https://openalex.org/W4365800003"],"related_works":["https://openalex.org/W1597650818","https://openalex.org/W2124048060","https://openalex.org/W1902983110","https://openalex.org/W4386915331","https://openalex.org/W2073684863","https://openalex.org/W2246013950","https://openalex.org/W2378910916","https://openalex.org/W2119634093","https://openalex.org/W1970720081","https://openalex.org/W2751263050"],"abstract_inverted_index":{"Fault-tolerance":[0],"has":[1],"always":[2],"been":[3],"an":[4],"important":[5],"topic":[6],"when":[7],"it":[8,161],"comes":[9],"to":[10,24,62,100],"running":[11],"massively":[12],"parallel":[13],"programs":[14],"at":[15],"scale.":[16],"Statistically,":[17],"hardware":[18],"and":[19,82,123,138,158],"software":[20],"failures":[21],"are":[22],"expected":[23],"occur":[25],"more":[26,41,168],"often":[27],"on":[28,117,126],"systems":[29],"gathering":[30],"millions":[31],"of":[32],"computing":[33,42],"units.":[34],"Moreover,":[35],"the":[36,40,55,68,108,132,164],"larger":[37],"jobs":[38],"are,":[39],"hours":[43],"would":[44],"be":[45,97,163],"wasted":[46],"by":[47],"a":[48,104],"crash.":[49],"In":[50],"this":[51],"paper,":[52],"we":[53],"describe":[54],"work":[56,77],"done":[57],"in":[58],"our":[59,76],"MPI":[60,69,101,109,118,154],"runtime":[61],"enable":[63],"transparent":[64,93],"checkpointing":[65,94],"mechanism.":[66],"Unlike":[67],"4.0":[70],"User-Level":[71],"Failure":[72],"Mitigation":[73],"(ULFM)":[74],"interface,":[75],"targets":[78],"solely":[79],"Checkpoint/Restart":[80],"(C/R)":[81],"ignores":[83],"wider":[84],"features":[85],"such":[86,120],"as":[87,121],"resiliency.":[88],"We":[89,144],"show":[90,159],"how":[91,160],"existing":[92],"methods":[95],"can":[96],"practically":[98],"applied":[99],"implementations":[102],"given":[103],"sufficient":[105],"collaboration":[106],"from":[107],"runtime.":[110],"Our":[111],"C/R":[112],"technique":[113],"is":[114,135,141,156],"then":[115],"measured":[116],"benchmarks":[119],"IMB":[122],"Lulesh":[124],"relying":[125],"Infiniband":[127],"high-speed":[128],"network,":[129],"demonstrating":[130],"that":[131,139,146],"chosen":[133],"approach":[134],"sufficiently":[136],"general":[137],"performance":[140],"mostly":[142],"preserved.":[143],"argue":[145],"enabling":[147],"fault-tolerance":[148],"without":[149],"any":[150],"modification":[151],"inside":[152],"target":[153],"applications":[155],"possible,":[157],"could":[162],"first":[165],"step":[166],"for":[167],"integrated":[169],"resiliency":[170],"combined":[171],"with":[172],"failure":[173],"mitigation":[174],"like":[175],"ULFM.":[176]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":2},{"year":2018,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
