{"id":"https://openalex.org/W2017060126","doi":"https://doi.org/10.1109/sc.2014.78","title":"Exploring Automatic, Online Failure Recovery for Scientific Applications at Extreme Scales","display_name":"Exploring Automatic, Online Failure Recovery for Scientific Applications at Extreme Scales","publication_year":2014,"publication_date":"2014-11-01","ids":{"openalex":"https://openalex.org/W2017060126","doi":"https://doi.org/10.1109/sc.2014.78","mag":"2017060126"},"language":"en","primary_location":{"id":"doi:10.1109/sc.2014.78","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sc.2014.78","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"SC14: International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027432313","display_name":"Marc Gamell","orcid":null},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Marc Gamell","raw_affiliation_strings":["NSF Cloud and Autonomic Computing Center, Rutgers University, Piscataway, NJ, USA","Rutgers University, Piscataway, NJ"],"affiliations":[{"raw_affiliation_string":"NSF Cloud and Autonomic Computing Center, Rutgers University, Piscataway, NJ, USA","institution_ids":["https://openalex.org/I102322142"]},{"raw_affiliation_string":"Rutgers University, Piscataway, NJ","institution_ids":["https://openalex.org/I102322142"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061630529","display_name":"Daniel S. Katz","orcid":"https://orcid.org/0000-0001-5934-7525"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel S. Katz","raw_affiliation_strings":["Computation Institute, Argonne National Laboratory, Chicago, IL, USA","University of Chicago & Argonne National Laboratory, Chicago, IL#TAB#"],"affiliations":[{"raw_affiliation_string":"Computation Institute, Argonne National Laboratory, Chicago, IL, USA","institution_ids":["https://openalex.org/I1282105669"]},{"raw_affiliation_string":"University of Chicago & Argonne National Laboratory, Chicago, IL#TAB#","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012593852","display_name":"Hemanth Kolla","orcid":"https://orcid.org/0000-0003-4969-5870"},"institutions":[{"id":"https://openalex.org/I192454743","display_name":"Sandia National Laboratories California","ror":"https://ror.org/058m7ey48","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1330989302","https://openalex.org/I192454743","https://openalex.org/I198811213","https://openalex.org/I198811213","https://openalex.org/I4210104735"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hemanth Kolla","raw_affiliation_strings":["Scalable Modeling and Analysis Department, Sandia National Laboratories, Livermore, CA, USA","SANDIA NATIONAL LABORATORIES, LIVERMORE, CA"],"affiliations":[{"raw_affiliation_string":"Scalable Modeling and Analysis Department, Sandia National Laboratories, Livermore, CA, USA","institution_ids":["https://openalex.org/I192454743"]},{"raw_affiliation_string":"SANDIA NATIONAL LABORATORIES, LIVERMORE, CA","institution_ids":["https://openalex.org/I192454743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101693761","display_name":"Jacqueline Chen","orcid":"https://orcid.org/0000-0002-9896-6262"},"institutions":[{"id":"https://openalex.org/I192454743","display_name":"Sandia National Laboratories California","ror":"https://ror.org/058m7ey48","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1330989302","https://openalex.org/I192454743","https://openalex.org/I198811213","https://openalex.org/I198811213","https://openalex.org/I4210104735"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jacqueline Chen","raw_affiliation_strings":["Combustion Research Facility, Sandia National Laboratories, Livermore, CA, USA","SANDIA NATIONAL LABORATORIES, LIVERMORE, CA"],"affiliations":[{"raw_affiliation_string":"Combustion Research Facility, Sandia National Laboratories, Livermore, CA, USA","institution_ids":["https://openalex.org/I192454743"]},{"raw_affiliation_string":"SANDIA NATIONAL LABORATORIES, LIVERMORE, CA","institution_ids":["https://openalex.org/I192454743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030373337","display_name":"Scott Klasky","orcid":"https://orcid.org/0000-0003-3559-5772"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Scott Klasky","raw_affiliation_strings":["Oak Ridge National Laboratories, Oak Ridge, TN, USA","Oak Ridge National Lab,,,Oak Ridge,TN,"],"affiliations":[{"raw_affiliation_string":"Oak Ridge National Laboratories, Oak Ridge, TN, USA","institution_ids":["https://openalex.org/I1289243028"]},{"raw_affiliation_string":"Oak Ridge National Lab,,,Oak Ridge,TN,","institution_ids":["https://openalex.org/I1289243028"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5060153432","display_name":"Manish Parashar","orcid":"https://orcid.org/0000-0003-0983-7408"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Manish Parashar","raw_affiliation_strings":["NSF Cloud and Autonomic Computing Center, Rutgers University, Piscataway, NJ, USA","Rutgers University, Piscataway, NJ"],"affiliations":[{"raw_affiliation_string":"NSF Cloud and Autonomic Computing Center, Rutgers University, Piscataway, NJ, USA","institution_ids":["https://openalex.org/I102322142"]},{"raw_affiliation_string":"Rutgers University, Piscataway, NJ","institution_ids":["https://openalex.org/I102322142"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5027432313"],"corresponding_institution_ids":["https://openalex.org/I102322142"],"apc_list":null,"apc_paid":null,"fwci":7.9306,"has_fulltext":false,"cited_by_count":81,"citation_normalized_percentile":{"value":0.97572834,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"895","last_page":"906"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7723648548126221},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5238295197486877},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4812890589237213},{"id":"https://openalex.org/keywords/resilience","display_name":"Resilience (materials science)","score":0.47389617562294006},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.442608505487442},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4266192317008972},{"id":"https://openalex.org/keywords/titan","display_name":"Titan (rocket family)","score":0.42505279183387756},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.4197829067707062},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.367680162191391}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7723648548126221},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5238295197486877},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4812890589237213},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.47389617562294006},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.442608505487442},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4266192317008972},{"id":"https://openalex.org/C50805821","wikidata":"https://www.wikidata.org/wiki/Q1136670","display_name":"Titan (rocket family)","level":2,"score":0.42505279183387756},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.4197829067707062},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.367680162191391},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C146978453","wikidata":"https://www.wikidata.org/wiki/Q3798668","display_name":"Aerospace engineering","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/sc.2014.78","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sc.2014.78","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"SC14: International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320306250","display_name":"Battelle","ror":"https://ror.org/01h5tnr73"},{"id":"https://openalex.org/F4320316892","display_name":"UT-Battelle","ror":"https://ror.org/04nza6677"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":72,"referenced_works":["https://openalex.org/W141129880","https://openalex.org/W170768260","https://openalex.org/W1545979834","https://openalex.org/W1576341769","https://openalex.org/W1923741182","https://openalex.org/W1959638903","https://openalex.org/W1965091139","https://openalex.org/W1978347128","https://openalex.org/W1981432246","https://openalex.org/W1981812667","https://openalex.org/W1984564341","https://openalex.org/W1985713815","https://openalex.org/W1986905947","https://openalex.org/W1991421378","https://openalex.org/W1993660990","https://openalex.org/W2000739449","https://openalex.org/W2014332384","https://openalex.org/W2014594876","https://openalex.org/W2017059273","https://openalex.org/W2018060613","https://openalex.org/W2031260715","https://openalex.org/W2033656974","https://openalex.org/W2036641664","https://openalex.org/W2045611838","https://openalex.org/W2049459394","https://openalex.org/W2077546582","https://openalex.org/W2078837027","https://openalex.org/W2083613288","https://openalex.org/W2095487435","https://openalex.org/W2100970777","https://openalex.org/W2101852949","https://openalex.org/W2103308354","https://openalex.org/W2105524676","https://openalex.org/W2116115793","https://openalex.org/W2116768598","https://openalex.org/W2117667603","https://openalex.org/W2125554779","https://openalex.org/W2128577831","https://openalex.org/W2131053137","https://openalex.org/W2132860930","https://openalex.org/W2137512117","https://openalex.org/W2148575324","https://openalex.org/W2149053506","https://openalex.org/W2155204206","https://openalex.org/W2155662278","https://openalex.org/W2160779498","https://openalex.org/W2167259479","https://openalex.org/W2170163131","https://openalex.org/W2170454619","https://openalex.org/W2171007672","https://openalex.org/W2296772319","https://openalex.org/W2336067652","https://openalex.org/W2730779250","https://openalex.org/W2898724554","https://openalex.org/W3141239549","https://openalex.org/W3150262005","https://openalex.org/W4234514454","https://openalex.org/W4247768107","https://openalex.org/W4249693612","https://openalex.org/W4252909492","https://openalex.org/W4256364678","https://openalex.org/W4285719527","https://openalex.org/W6640893626","https://openalex.org/W6646916258","https://openalex.org/W6670243553","https://openalex.org/W6675052984","https://openalex.org/W6682581173","https://openalex.org/W6685186519","https://openalex.org/W6702820436","https://openalex.org/W6740868767","https://openalex.org/W6756182242","https://openalex.org/W7046307239"],"related_works":["https://openalex.org/W2408545863","https://openalex.org/W643179351","https://openalex.org/W2740469715","https://openalex.org/W2905265805","https://openalex.org/W2162805293","https://openalex.org/W2013629274","https://openalex.org/W3169395362","https://openalex.org/W2064207836","https://openalex.org/W2025174650","https://openalex.org/W2973472379"],"abstract_inverted_index":{"Application":[0],"resilience":[1],"is":[2,42],"a":[3,55],"key":[4],"challenge":[5],"that":[6],"must":[7],"be":[8],"addressed":[9],"in":[10,67],"order":[11],"to":[12,45,47,99,135],"realize":[13],"the":[14,30,36,73,100,117,123],"exascale":[15],"vision.":[16],"Process/node":[17],"failures,":[18,23,84],"an":[19,68],"important":[20],"class":[21],"of":[22],"are":[24],"typically":[25],"handled":[26],"today":[27],"by":[28],"terminating":[29],"job":[31],"and":[32,75,94],"restarting":[33],"it":[34],"from":[35,60],"last":[37],"stored":[38],"checkpoint.":[39],"This":[40],"approach":[41],"not":[43],"expected":[44],"scale":[46],"exascale.":[48],"In":[49],"this":[50],"paper":[51],"we":[52,130],"present":[53],"Fenix,":[54],"framework":[56],"for":[57,63,81],"enabling":[58],"recovery":[59],"process/node/blade/cabinet":[61],"failures":[62],"MPI-based":[64],"parallel":[65],"applications":[66],"online":[69],"(i.e.,":[70],"Without":[71],"disrupting":[72],"job)":[74],"transparent":[76],"manner.":[77],"Fenix":[78,107],"provides":[79],"mechanisms":[80],"transparently":[82],"capturing":[83],"re-spawning":[85],"new":[86],"processes,":[87],"fixing":[88],"failed":[89],"communicators,":[90],"restoring":[91],"application":[92],"state,":[93],"returning":[95],"execution":[96],"control":[97],"back":[98],"application.":[101],"To":[102],"enable":[103],"automatic":[104],"data":[105],"recovery,":[106],"relies":[108],"on":[109,122],"application-driven,":[110],"diskless,":[111],"implicitly":[112],"coordinated":[113],"check":[114],"pointing.":[115],"Using":[116],"S3D":[118],"combustion":[119],"simulation":[120],"running":[121],"Titan":[124],"Cray-XK7":[125],"production":[126],"system":[127],"at":[128],"ORNL,":[129],"experimentally":[131],"demonstrate":[132],"Felix's":[133],"ability":[134],"tolerate":[136],"high":[137],"failure":[138],"rates":[139],"(e.g.,":[140],"More":[141],"than":[142],"one":[143],"per":[144],"minute)":[145],"with":[146],"low":[147],"overhead":[148],"while":[149],"sustaining":[150],"performance.":[151]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":7},{"year":2021,"cited_by_count":8},{"year":2020,"cited_by_count":15},{"year":2019,"cited_by_count":5},{"year":2018,"cited_by_count":11},{"year":2017,"cited_by_count":12},{"year":2016,"cited_by_count":4},{"year":2015,"cited_by_count":7}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
