{"id":"https://openalex.org/W3178420079","doi":"https://doi.org/10.1109/tpds.2021.3096055","title":"Near-Zero Downtime Recovery From Transient-Error-Induced Crashes","display_name":"Near-Zero Downtime Recovery From Transient-Error-Induced Crashes","publication_year":2021,"publication_date":"2021-07-09","ids":{"openalex":"https://openalex.org/W3178420079","doi":"https://doi.org/10.1109/tpds.2021.3096055","mag":"3178420079"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2021.3096055","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2021.3096055","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100408398","display_name":"Chao Chen","orcid":"https://orcid.org/0000-0003-1960-4042"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Chao Chen","raw_affiliation_strings":["Amazon Science, Santa Clara, CA, USA"],"raw_orcid":"https://orcid.org/0000-0003-1960-4042","affiliations":[{"raw_affiliation_string":"Amazon Science, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064465394","display_name":"Greg Eisenhauer","orcid":null},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Greg Eisenhauer","raw_affiliation_strings":["School of Computer Science, Georgia Institute of Technology, Atlanta, GA, USA"],"raw_orcid":"https://orcid.org/0000-0002-2070-043X","affiliations":[{"raw_affiliation_string":"School of Computer Science, Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061235810","display_name":"Santosh Pande","orcid":"https://orcid.org/0000-0001-6723-8062"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Santosh Pande","raw_affiliation_strings":["School of Computer Science, Georgia Institute of Technology, Atlanta, GA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science, Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100408398"],"corresponding_institution_ids":["https://openalex.org/I1311688040"],"apc_list":null,"apc_paid":null,"fwci":0.2033,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.50567453,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"33","issue":"4","first_page":"765","last_page":"778"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7248654961585999},{"id":"https://openalex.org/keywords/downtime","display_name":"Downtime","score":0.7182371020317078},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.628600001335144},{"id":"https://openalex.org/keywords/transient","display_name":"Transient (computer programming)","score":0.5561171174049377},{"id":"https://openalex.org/keywords/crash","display_name":"Crash","score":0.5500350594520569},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.5163847208023071},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.46824562549591064},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.46413418650627136},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.34966588020324707},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.22846147418022156},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.14158546924591064},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.11790555715560913}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7248654961585999},{"id":"https://openalex.org/C180591934","wikidata":"https://www.wikidata.org/wiki/Q1253369","display_name":"Downtime","level":2,"score":0.7182371020317078},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.628600001335144},{"id":"https://openalex.org/C2780799671","wikidata":"https://www.wikidata.org/wiki/Q17087362","display_name":"Transient (computer programming)","level":2,"score":0.5561171174049377},{"id":"https://openalex.org/C183469790","wikidata":"https://www.wikidata.org/wiki/Q333501","display_name":"Crash","level":2,"score":0.5500350594520569},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.5163847208023071},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.46824562549591064},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.46413418650627136},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.34966588020324707},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.22846147418022156},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.14158546924591064},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.11790555715560913}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2021.3096055","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2021.3096055","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1978082708","https://openalex.org/W1978347128","https://openalex.org/W1988070283","https://openalex.org/W1997843200","https://openalex.org/W2021337678","https://openalex.org/W2021595237","https://openalex.org/W2036641664","https://openalex.org/W2037523067","https://openalex.org/W2043422726","https://openalex.org/W2080640552","https://openalex.org/W2088552220","https://openalex.org/W2105524676","https://openalex.org/W2110137598","https://openalex.org/W2128511938","https://openalex.org/W2134320686","https://openalex.org/W2336984031","https://openalex.org/W2346318102","https://openalex.org/W2486202470","https://openalex.org/W2563678330","https://openalex.org/W2565180819","https://openalex.org/W2565481669","https://openalex.org/W2631994569","https://openalex.org/W2647773517","https://openalex.org/W2767321582","https://openalex.org/W2767966988","https://openalex.org/W2808246216","https://openalex.org/W2903494439","https://openalex.org/W2986347693","https://openalex.org/W3014653994","https://openalex.org/W3034155241","https://openalex.org/W4229822017","https://openalex.org/W4232172996","https://openalex.org/W4233556486","https://openalex.org/W4246684434","https://openalex.org/W4254699165","https://openalex.org/W6649817641","https://openalex.org/W6680487673"],"related_works":["https://openalex.org/W2046276983","https://openalex.org/W2954002293","https://openalex.org/W2078264086","https://openalex.org/W2892741875","https://openalex.org/W2164372000","https://openalex.org/W1862835629","https://openalex.org/W2099111379","https://openalex.org/W2136799148","https://openalex.org/W2897533804","https://openalex.org/W2890506991"],"abstract_inverted_index":{"Due":[0],"to":[1,40,54,61,73,98,115,146,159,192,207],"the":[2,25,86,125,148,209,227,297,353,359],"system":[3],"scaling,":[4],"<italic":[5],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[6,107,130,172,218,236,241,250,256,279,292,320,348],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">transient":[7],"errors</i>":[8],"caused":[9],"by":[10,178,302],"external":[11],"noise,":[12],"e.g.,":[13],"heat":[14],"fluxes":[15],"and":[16,27,91,117,189,245,288,338,355],"particle":[17],"strikes,":[18],"have":[19],"become":[20],"a":[21,110,133,246,330],"growing":[22],"concern":[23],"for":[24,79,138,212,314],"current":[26],"upcoming":[28],"exa-scale":[29,364],"high-performance-computing":[30],"(HPC)":[31],"systems.":[32,365],"Applications":[33],"running":[34,284],"on":[35,186,260,285],"these":[36],"systems":[37,81],"are":[38,67,144,205],"expected":[39],"experience":[41],"transient":[42],"errors":[43,66,267,301],"more":[44],"frequently":[45],"than":[46],"ever":[47],"before,":[48],"which":[49,316],"will":[50],"either":[51],"lead":[52],"them":[53,60],"generate":[55],"incorrect":[56],"outputs":[57],"or":[58],"cause":[59],"crash.":[62],"However,":[63],"since":[64],"such":[65,342],"still":[68],"quite":[69],"rare":[70],"as":[71,224,226],"compared":[72],"no-fault":[74,89],"cases,":[75],"desirable":[76],"solutions":[77],"call":[78],"low/no-overhead":[80],"that":[82,290,318],"do":[83],"not":[84],"compromise":[85],"performance":[87],"under":[88],"conditions":[90],"also":[92,276],"allow":[93],"very":[94,331],"fast":[95],"fault":[96],"recovery":[97,136,142,169,195,345],"minimize":[99],"downtime.":[100,274],"In":[101],"this":[102,198],"article,":[103],"we":[104,308],"present":[105,309],"<bold":[106,129,171,217,235,240,249,255,278,291,319,347],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">IterPro</b>":[108,131,173,219,237,257,280,293,321,349],",":[109],"light-weight":[111],"compiler-assisted":[112],"resilience":[113,213,360],"technique":[114],"quickly":[116],"accurately":[118],"recover":[119,259],"processes":[120],"from":[121],"transient-error-induced":[122],"crashes.":[123],"During":[124,231],"compilation":[126],"of":[127,135,155,164,265,270,299,324,335,358],"applications,":[128],"constructs":[132],"set":[134],"kernels":[137,143],"crash-prone":[139],"instructions.":[140],"These":[141],"executed":[145],"repair":[147],"corrupted":[149],"process":[150],"states":[151],"on-the-fly":[152],"upon":[153],"occurrences":[154],"errors,":[156],"enabling":[157],"applications":[158],"continue":[160],"their":[161,232],"executions":[162],"instead":[163],"being":[165],"terminated.":[166],"When":[167],"constructing":[168],"kernels,":[170],"exploits":[174],"side":[175,210],"effects":[176,211],"introduced":[177,206],"induction":[179],"variable":[180],"based":[181,185],"code":[182,202],"optimization":[183],"techniques":[184],"loop":[187],"unrolling":[188],"strength":[190],"reduction":[191],"improve":[193],"its":[194],"capability.":[196],"To":[197],"end,":[199],"two":[200],"new":[201],"transformation":[203],"passes":[204],"expose":[208],"purposes.":[214],"We":[215,275],"evaluated":[216,277],"with":[220,272,281,329],"4":[221],"scientific":[222],"workloads":[223],"well":[225],"NPB":[228],"benchmarks":[229],"suite.":[230],"normal":[233],"execution,":[234],"incurs":[238],"almost":[239,304],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">zero</b>":[242],"runtime":[243],"overhead":[244],"small,":[247],"fixed":[248],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">27MB</b>":[251],"memory":[252],"overhead.":[253],"Meanwhile,":[254],"can":[258,294],"an":[261,343],"average":[262],"83.55":[263],"percent":[264,337],"crash-causing":[266,300],"within":[268],"dozens":[269],"milliseconds":[271],"negligible":[273,339],"parallel":[282],"jobs":[283],"3072":[286],"cores":[287],"showed":[289],"successfully":[295],"mask":[296],"impact":[298],"providing":[303],"uninterrupted":[305],"execution.":[306],"Finally,":[307],"our":[310],"preliminary":[311],"evaluation":[312],"result":[313],"BLAS,":[315],"shows":[317],"is":[322],"capable":[323],"recovering":[325],"failures":[326],"in":[327,362],"libraries":[328],"high":[332],"coverage":[333],"rate":[334],"83":[336],"overheads.":[340],"With":[341],"effective":[344],"mechanism,":[346],"could":[350],"tremendously":[351],"mitigate":[352],"overheads":[354],"resource":[356],"requirements":[357],"subsystem":[361],"future":[363]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
