{"id":"https://openalex.org/W2036641664","doi":"https://doi.org/10.1177/1094342009347767","title":"Toward Exascale Resilience","display_name":"Toward Exascale Resilience","publication_year":2009,"publication_date":"2009-09-17","ids":{"openalex":"https://openalex.org/W2036641664","doi":"https://doi.org/10.1177/1094342009347767","mag":"2036641664"},"language":"en","primary_location":{"id":"doi:10.1177/1094342009347767","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342009347767","pdf_url":null,"source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046613458","display_name":"Franck Cappello","orcid":"https://orcid.org/0000-0002-7890-3934"},"institutions":[{"id":"https://openalex.org/I1326498283","display_name":"Institut national de recherche en informatique et en automatique","ror":"https://ror.org/02kvxyf05","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1326498283"]},{"id":"https://openalex.org/I4210144804","display_name":"Laboratoire de Recherche en Informatique","ror":"https://ror.org/04e3ktk27","country_code":"FR","type":"facility","lineage":["https://openalex.org/I102197404","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I4210144804","https://openalex.org/I4210159245"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Franck Cappello","raw_affiliation_strings":["INRIA, LABORATOIRE EN RECHERCHE INFORMATIQUE, FRANCE,"],"affiliations":[{"raw_affiliation_string":"INRIA, LABORATOIRE EN RECHERCHE INFORMATIQUE, FRANCE,","institution_ids":["https://openalex.org/I4210144804","https://openalex.org/I1326498283"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113990568","display_name":"Al Geist","orcid":null},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Al Geist","raw_affiliation_strings":["OAK RIDGE NATIONAL LABORATORY, TN, USA"],"affiliations":[{"raw_affiliation_string":"OAK RIDGE NATIONAL LABORATORY, TN, USA","institution_ids":["https://openalex.org/I1289243028"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017241944","display_name":"William Gropp","orcid":"https://orcid.org/0000-0003-2905-3029"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bill Gropp","raw_affiliation_strings":["DEPARTMENT OF COMPUTER SCIENCE, UNIVERSITY OF ILLINOIS\rAT URBANA-CHAMPAIGN, USA"],"affiliations":[{"raw_affiliation_string":"DEPARTMENT OF COMPUTER SCIENCE, UNIVERSITY OF ILLINOIS\rAT URBANA-CHAMPAIGN, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051465480","display_name":"Laxmikant V. Kal\u00e9","orcid":"https://orcid.org/0000-0001-9673-8445"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Laxmikant Kale","raw_affiliation_strings":["DEPARTMENT OF COMPUTER SCIENCE, UNIVERSITY OF ILLINOIS\rAT URBANA-CHAMPAIGN, USA"],"affiliations":[{"raw_affiliation_string":"DEPARTMENT OF COMPUTER SCIENCE, UNIVERSITY OF ILLINOIS\rAT URBANA-CHAMPAIGN, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053243747","display_name":"Bill Kramer","orcid":null},"institutions":[{"id":"https://openalex.org/I4210151627","display_name":"National Energy Research Scientific Computing Center","ror":"https://ror.org/05v3mvq14","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521","https://openalex.org/I4210151627"]},{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bill Kramer","raw_affiliation_strings":["NERSC, LAWRENCE BERKELEY NATIONAL LABORATORY, IL, USA","NERSC, LAWRENCE BERKELEY NATIONAL LABORATORY, IL, USA#TAB#"],"affiliations":[{"raw_affiliation_string":"NERSC, LAWRENCE BERKELEY NATIONAL LABORATORY, IL, USA","institution_ids":["https://openalex.org/I4210151627"]},{"raw_affiliation_string":"NERSC, LAWRENCE BERKELEY NATIONAL LABORATORY, IL, USA#TAB#","institution_ids":["https://openalex.org/I148283060"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5041731995","display_name":"Marc Snir","orcid":"https://orcid.org/0000-0002-3504-2468"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Marc Snir","raw_affiliation_strings":["DEPARTMENT OF COMPUTER SCIENCE, UNIVERSITY OF ILLINOIS\rAT URBANA-CHAMPAIGN, USA"],"affiliations":[{"raw_affiliation_string":"DEPARTMENT OF COMPUTER SCIENCE, UNIVERSITY OF ILLINOIS\rAT URBANA-CHAMPAIGN, USA","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5046613458"],"corresponding_institution_ids":["https://openalex.org/I1326498283","https://openalex.org/I4210144804"],"apc_list":null,"apc_paid":null,"fwci":20.2799,"has_fulltext":false,"cited_by_count":334,"citation_normalized_percentile":{"value":0.99514676,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"23","issue":"4","first_page":"374","last_page":"388"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9884999990463257,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/petascale-computing","display_name":"Petascale computing","score":0.8276516199111938},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8248664140701294},{"id":"https://openalex.org/keywords/resilience","display_name":"Resilience (materials science)","score":0.7271317839622498},{"id":"https://openalex.org/keywords/exascale-computing","display_name":"Exascale computing","score":0.7159207463264465},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6464006900787354},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5766811966896057},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5674301385879517},{"id":"https://openalex.org/keywords/massively-parallel","display_name":"Massively parallel","score":0.48620834946632385},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.45671430230140686},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.34653934836387634},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.06914344429969788}],"concepts":[{"id":"https://openalex.org/C185410017","wikidata":"https://www.wikidata.org/wiki/Q7171778","display_name":"Petascale computing","level":3,"score":0.8276516199111938},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8248664140701294},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.7271317839622498},{"id":"https://openalex.org/C2778837361","wikidata":"https://www.wikidata.org/wiki/Q2450880","display_name":"Exascale computing","level":3,"score":0.7159207463264465},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6464006900787354},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5766811966896057},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5674301385879517},{"id":"https://openalex.org/C190475519","wikidata":"https://www.wikidata.org/wiki/Q544384","display_name":"Massively parallel","level":2,"score":0.48620834946632385},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.45671430230140686},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.34653934836387634},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.06914344429969788},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1177/1094342009347767","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342009347767","pdf_url":null,"source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W4013337","https://openalex.org/W28511425","https://openalex.org/W169659540","https://openalex.org/W1493735863","https://openalex.org/W1506451579","https://openalex.org/W1509835855","https://openalex.org/W1568637577","https://openalex.org/W1585274809","https://openalex.org/W1847519410","https://openalex.org/W1987608073","https://openalex.org/W1996839061","https://openalex.org/W2016737685","https://openalex.org/W2053352232","https://openalex.org/W2083613288","https://openalex.org/W2100970777","https://openalex.org/W2105039796","https://openalex.org/W2107263349","https://openalex.org/W2110137598","https://openalex.org/W2111132249","https://openalex.org/W2123646449","https://openalex.org/W2131053137","https://openalex.org/W2139156547","https://openalex.org/W2145071552","https://openalex.org/W2146924779","https://openalex.org/W2146948159","https://openalex.org/W2155204206","https://openalex.org/W2156514327","https://openalex.org/W2158907675","https://openalex.org/W2159161022","https://openalex.org/W2291650579","https://openalex.org/W2296772319","https://openalex.org/W2497735908","https://openalex.org/W2731397968","https://openalex.org/W3182208082","https://openalex.org/W4213383978","https://openalex.org/W4230426658","https://openalex.org/W4232707757","https://openalex.org/W4252679991"],"related_works":["https://openalex.org/W2021702679","https://openalex.org/W3038449658","https://openalex.org/W1582746211","https://openalex.org/W3129378740","https://openalex.org/W2266027327","https://openalex.org/W2249929881","https://openalex.org/W4289494037","https://openalex.org/W1569809235","https://openalex.org/W2278366184","https://openalex.org/W1486544172"],"abstract_inverted_index":{"Over":[0],"the":[1,19,54,84,102,110,124,152,160,170,176],"past":[2],"few":[3],"years":[4,167],"resilience":[5],"has":[6,162],"became":[7],"a":[8,36,50,116,133],"major":[9],"issue":[10],"for":[11,87,104,129],"high-performance":[12],"computing":[13],"(HPC)":[14],"systems,":[15,62],"in":[16,18,191],"particular":[17],"perspective":[20],"of":[21,41,59,73,115,121,126,156,185,189],"large":[22,61],"petascale":[23],"systems":[24,30,68,131,196],"and":[25,57,106,179,197],"future":[26],"exascale":[27,67,157],"systems.":[28,158],"These":[29],"will":[31,69,98,108],"typically":[32],"gather":[33],"from":[34],"half":[35],"million":[37],"to":[38,49,113,144,165,168],"several":[39,186],"millions":[40],"central":[42],"processing":[43],"unit":[44],"(CPU)":[45],"cores":[46],"running":[47],"up":[48],"billion":[51],"threads.":[52],"From":[53],"current":[55,85],"knowledge":[56],"observations":[58,178],"existing":[60],"it":[63],"is":[64,80],"anticipated":[65,82],"that":[66,83],"experience":[70],"various":[71],"kind":[72],"faults":[74],"many":[75],"times":[76],"per":[77],"day.":[78],"It":[79],"also":[81],"approach":[86],"resilience,":[88],"which":[89,139],"relies":[90],"on":[91],"automatic":[92],"or":[93],"application":[94],"level":[95],"checkpoint/":[96],"restart,":[97],"not":[99],"work":[100],"because":[101],"time":[103,112],"checkpointing":[105],"restarting":[107],"exceed":[109],"mean":[111],"failure":[114],"full":[117],"system.":[118],"This":[119,172],"set":[120],"projections":[122],"leaves":[123],"community":[125,161],"fault":[127],"tolerance":[128],"HPC":[130,190],"with":[132],"difficult":[134],"challenge:":[135],"finding":[136],"new":[137],"approaches,":[138],"are":[140],"possibly":[141],"radically":[142],"disruptive,":[143],"run":[145],"applications":[146],"until":[147],"their":[148],"normal":[149],"termination,":[150],"despite":[151],"essentially":[153],"unstable":[154],"nature":[155],"Yet,":[159],"only":[163],"five":[164],"six":[166],"solve":[169],"problem.":[171],"white":[173],"paper":[174],"synthesizes":[175],"motivations,":[177],"research":[180],"issues":[181],"considered":[182],"as":[183],"determinant":[184],"complimentary":[187],"experts":[188],"applications,":[192],"programming":[193],"models,":[194],"distributed":[195],"system":[198],"management.":[199]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":7},{"year":2020,"cited_by_count":10},{"year":2019,"cited_by_count":24},{"year":2018,"cited_by_count":32},{"year":2017,"cited_by_count":31},{"year":2016,"cited_by_count":43},{"year":2015,"cited_by_count":51},{"year":2014,"cited_by_count":37},{"year":2013,"cited_by_count":32},{"year":2012,"cited_by_count":27}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
