{"id":"https://openalex.org/W2024966997","doi":"https://doi.org/10.1109/hipc.2011.6152716","title":"Building algorithmically nonstop fault tolerant MPI programs","display_name":"Building algorithmically nonstop fault tolerant MPI programs","publication_year":2011,"publication_date":"2011-12-01","ids":{"openalex":"https://openalex.org/W2024966997","doi":"https://doi.org/10.1109/hipc.2011.6152716","mag":"2024966997"},"language":"en","primary_location":{"id":"doi:10.1109/hipc.2011.6152716","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hipc.2011.6152716","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2011 18th International Conference on High Performance Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100431257","display_name":"Rui Wang","orcid":"https://orcid.org/0009-0003-8935-3119"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rui Wang","raw_affiliation_strings":["State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020595518","display_name":"Erlin Yao","orcid":"https://orcid.org/0000-0002-0130-0035"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Erlin Yao","raw_affiliation_strings":["State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052120417","display_name":"Mingyu Chen","orcid":"https://orcid.org/0000-0002-7264-0731"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingyu Chen","raw_affiliation_strings":["State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104072170","display_name":"Guangming Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangming Tan","raw_affiliation_strings":["State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, ,","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050531633","display_name":"Pavan Balaji","orcid":null},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pavan Balaji","raw_affiliation_strings":["Mathematics and Computer Science, Argonne National Laboratory, USA","Mathematics and Computer Science, Argonne National Laboratory"],"affiliations":[{"raw_affiliation_string":"Mathematics and Computer Science, Argonne National Laboratory, USA","institution_ids":["https://openalex.org/I1282105669"]},{"raw_affiliation_string":"Mathematics and Computer Science, Argonne National Laboratory","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034514540","display_name":"Darius Buntinas","orcid":null},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Darius Buntinas","raw_affiliation_strings":["Mathematics and Computer Science, Argonne National Laboratory, USA","Mathematics and Computer Science, Argonne National Laboratory"],"affiliations":[{"raw_affiliation_string":"Mathematics and Computer Science, Argonne National Laboratory, USA","institution_ids":["https://openalex.org/I1282105669"]},{"raw_affiliation_string":"Mathematics and Computer Science, Argonne National Laboratory","institution_ids":["https://openalex.org/I1282105669"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100431257"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210090176"],"apc_list":null,"apc_paid":null,"fwci":1.7504,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.84983689,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"3","issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/nonstop","display_name":"NonStop","score":0.9272042512893677},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8634096384048462},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.8463121056556702},{"id":"https://openalex.org/keywords/message-passing-interface","display_name":"Message Passing Interface","score":0.6620696783065796},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.604313850402832},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.5921542048454285},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4751546084880829},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.45215484499931335},{"id":"https://openalex.org/keywords/message-passing","display_name":"Message passing","score":0.39868032932281494},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3644030690193176}],"concepts":[{"id":"https://openalex.org/C2779013668","wikidata":"https://www.wikidata.org/wiki/Q826798","display_name":"NonStop","level":2,"score":0.9272042512893677},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8634096384048462},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.8463121056556702},{"id":"https://openalex.org/C166782233","wikidata":"https://www.wikidata.org/wiki/Q127879","display_name":"Message Passing Interface","level":3,"score":0.6620696783065796},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.604313850402832},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.5921542048454285},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4751546084880829},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.45215484499931335},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.39868032932281494},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3644030690193176},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hipc.2011.6152716","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hipc.2011.6152716","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2011 18th International Conference on High Performance Computing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.44999998807907104,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1576341769","https://openalex.org/W2037523067","https://openalex.org/W2045879521","https://openalex.org/W2054129151","https://openalex.org/W2082659044","https://openalex.org/W2083613288","https://openalex.org/W2089536264","https://openalex.org/W2097946733","https://openalex.org/W2102844000","https://openalex.org/W2126163873","https://openalex.org/W2133046454","https://openalex.org/W2138166684","https://openalex.org/W2150679822","https://openalex.org/W2151272421","https://openalex.org/W2151984682","https://openalex.org/W2155662278","https://openalex.org/W2158344138","https://openalex.org/W2296772319","https://openalex.org/W3198669718","https://openalex.org/W6662107652","https://openalex.org/W6682626600"],"related_works":["https://openalex.org/W2124048060","https://openalex.org/W1902983110","https://openalex.org/W2378910916","https://openalex.org/W4386915331","https://openalex.org/W2086666199","https://openalex.org/W1511717675","https://openalex.org/W2116006827","https://openalex.org/W2361929291","https://openalex.org/W2024966997","https://openalex.org/W2082659044"],"abstract_inverted_index":{"With":[0],"the":[1,29,39,48,51,62,69,77,97,108,119,122,130,136,139,143,159,166,177,199,212],"growing":[2],"scale":[3,67],"of":[4,50,72,110,121,141,165,180,201,214],"high-performance":[5],"computing":[6],"(HPC)":[7],"systems,":[8],"today":[9],"and":[10,45,100,134,174,197,206],"more":[11],"so":[12],"tomorrow,":[13],"faults":[14],"are":[15],"a":[16,57,91,151,185,223],"norm":[17],"rather":[18],"than":[19],"an":[20],"exception.":[21],"HPC":[22,73],"applications":[23],"typically":[24],"tolerate":[25],"fail-stop":[26],"failures":[27],"under":[28],"stop-and-wait":[30,63,79],"scheme,":[31,160],"where":[32],"even":[33,221],"if":[34],"only":[35],"one":[36],"processor":[37],"fails,":[38],"whole":[40],"system":[41],"has":[42],"to":[43,68,116,157],"stop":[44,115],"wait":[46,117],"for":[47,118],"recovery":[49,84,120,220],"corrupted":[52,123],"data.":[53],"It":[54],"is":[55],"now":[56],"more-or-less":[58],"accepted":[59],"fact":[60],"that":[61],"scheme":[64,95,205,217],"will":[65],"not":[66,114],"next":[70],"generation":[71],"systems.":[74],"Inspired":[75],"by":[76],"previous":[78],"algorithm-based":[80],"fault":[81,93],"tolerance":[82,94],"(ABFT)":[83],"technique,":[85],"we":[86,112,126],"propose":[87],"in":[88,176,222],"this":[89],"paper":[90],"nonstop":[92],"at":[96,150],"application":[98],"level":[99],"describe":[101,184],"its":[102],"implementation.":[103],"When":[104],"failure":[105],"occurs":[106],"during":[107],"execution":[109],"applications,":[111],"do":[113],"node;":[124],"instead,":[125],"replace":[127],"it":[128],"with":[129,193],"corresponding":[131],"redundant":[132],"node":[133],"continue":[135],"execution.":[137],"At":[138],"end":[140],"execution,":[142],"correct":[144],"solution":[145],"can":[146],"be":[147],"recovered":[148],"algorithmically":[149],"very":[152],"low":[153],"cost.":[154],"In":[155],"order":[156],"implement":[158],"some":[161],"new":[162,195,204,216],"fault-tolerant":[163],"features":[164,196],"Message":[167],"Passing":[168],"Interface":[169],"(MPI)":[170],"have":[171],"been":[172],"investigated":[173],"utilized":[175],"MPICH":[178],"implementation":[179],"MPI.":[181],"We":[182],"also":[183],"case":[186],"study":[187],"using":[188],"High":[189],"Performance":[190],"Linpack":[191],"(HPL)":[192],"these":[194],"evaluate":[198],"performance":[200],"both":[202],"our":[203,215],"ABFT":[207,219],"recovery.":[208],"Experimental":[209],"results":[210],"show":[211],"advantage":[213],"over":[218],"small":[224],"scale.":[225]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":3},{"year":2016,"cited_by_count":3},{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":2},{"year":2012,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
