{"id":"https://openalex.org/W2074787299","doi":"https://doi.org/10.1007/s11227-013-0884-0","title":"A survey of fault tolerance mechanisms and checkpoint/restart implementations for high performance computing systems","display_name":"A survey of fault tolerance mechanisms and checkpoint/restart implementations for high performance computing systems","publication_year":2013,"publication_date":"2013-02-11","ids":{"openalex":"https://openalex.org/W2074787299","doi":"https://doi.org/10.1007/s11227-013-0884-0","mag":"2074787299"},"language":"en","primary_location":{"id":"doi:10.1007/s11227-013-0884-0","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11227-013-0884-0","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11227-013-0884-0.pdf","source":{"id":"https://openalex.org/S32326811","display_name":"The Journal of Supercomputing","issn_l":"0920-8542","issn":["0920-8542","1573-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Journal of Supercomputing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s11227-013-0884-0.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063909341","display_name":"Ifeanyi P. Egwutuoha","orcid":null},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Ifeanyi P. Egwutuoha","raw_affiliation_strings":["School of Electrical & Information Engineering, The University of Sydney, Sydney, NSW, 2006, Australia","School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia 2006"],"affiliations":[{"raw_affiliation_string":"School of Electrical & Information Engineering, The University of Sydney, Sydney, NSW, 2006, Australia","institution_ids":["https://openalex.org/I129604602"]},{"raw_affiliation_string":"School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia 2006","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062521849","display_name":"David Levy","orcid":"https://orcid.org/0000-0002-2490-8318"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"David Levy","raw_affiliation_strings":["School of Electrical & Information Engineering, The University of Sydney, Sydney, NSW, 2006, Australia","School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia 2006"],"affiliations":[{"raw_affiliation_string":"School of Electrical & Information Engineering, The University of Sydney, Sydney, NSW, 2006, Australia","institution_ids":["https://openalex.org/I129604602"]},{"raw_affiliation_string":"School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia 2006","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052960440","display_name":"Bran Seli\u0107","orcid":"https://orcid.org/0000-0002-3703-8593"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Bran Selic","raw_affiliation_strings":["School of Electrical & Information Engineering, The University of Sydney, Sydney, NSW, 2006, Australia","School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia 2006"],"affiliations":[{"raw_affiliation_string":"School of Electrical & Information Engineering, The University of Sydney, Sydney, NSW, 2006, Australia","institution_ids":["https://openalex.org/I129604602"]},{"raw_affiliation_string":"School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia 2006","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043470959","display_name":"Shiping Chen","orcid":"https://orcid.org/0000-0002-4603-0024"},"institutions":[{"id":"https://openalex.org/I4210128581","display_name":"Information and Communication Technologies Centre","ror":"https://ror.org/034x2fx50","country_code":"AU","type":"facility","lineage":["https://openalex.org/I1292875679","https://openalex.org/I2801453606","https://openalex.org/I4210128581","https://openalex.org/I4387156119"]},{"id":"https://openalex.org/I1292875679","display_name":"Commonwealth Scientific and Industrial Research Organisation","ror":"https://ror.org/03qn8fb07","country_code":"AU","type":"funder","lineage":["https://openalex.org/I1292875679","https://openalex.org/I2801453606","https://openalex.org/I4387156119"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Shiping Chen","raw_affiliation_strings":["Information Engineering Laboratory, CSIRO ICT Centre, Sydney, Australia","Information Engineering Laboratory, CSIRO ICT Centre, Sydney, Australia#TAB#"],"affiliations":[{"raw_affiliation_string":"Information Engineering Laboratory, CSIRO ICT Centre, Sydney, Australia","institution_ids":["https://openalex.org/I1292875679","https://openalex.org/I4210128581"]},{"raw_affiliation_string":"Information Engineering Laboratory, CSIRO ICT Centre, Sydney, Australia#TAB#","institution_ids":["https://openalex.org/I1292875679"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5063909341"],"corresponding_institution_ids":["https://openalex.org/I129604602"],"apc_list":{"value":2390,"currency":"EUR","value_usd":2990},"apc_paid":{"value":2390,"currency":"EUR","value_usd":2990},"fwci":26.0969,"has_fulltext":true,"cited_by_count":248,"citation_normalized_percentile":{"value":0.99649809,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"65","issue":"3","first_page":"1302","last_page":"1326"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.878854513168335},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.7982277870178223},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.7507637739181519},{"id":"https://openalex.org/keywords/massively-parallel","display_name":"Massively parallel","score":0.5968949198722839},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.5841490626335144},{"id":"https://openalex.org/keywords/rollback","display_name":"Rollback","score":0.5679593086242676},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.50722336769104},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4668021500110626},{"id":"https://openalex.org/keywords/high-availability","display_name":"High availability","score":0.4388032853603363},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.39190176129341125},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.15464237332344055},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.12192636728286743}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.878854513168335},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.7982277870178223},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.7507637739181519},{"id":"https://openalex.org/C190475519","wikidata":"https://www.wikidata.org/wiki/Q544384","display_name":"Massively parallel","level":2,"score":0.5968949198722839},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.5841490626335144},{"id":"https://openalex.org/C174220543","wikidata":"https://www.wikidata.org/wiki/Q395307","display_name":"Rollback","level":3,"score":0.5679593086242676},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.50722336769104},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4668021500110626},{"id":"https://openalex.org/C65813073","wikidata":"https://www.wikidata.org/wiki/Q1622420","display_name":"High availability","level":2,"score":0.4388032853603363},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.39190176129341125},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.15464237332344055},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.12192636728286743},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C75949130","wikidata":"https://www.wikidata.org/wiki/Q848010","display_name":"Database transaction","level":2,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s11227-013-0884-0","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11227-013-0884-0","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11227-013-0884-0.pdf","source":{"id":"https://openalex.org/S32326811","display_name":"The Journal of Supercomputing","issn_l":"0920-8542","issn":["0920-8542","1573-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Journal of Supercomputing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s11227-013-0884-0","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11227-013-0884-0","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11227-013-0884-0.pdf","source":{"id":"https://openalex.org/S32326811","display_name":"The Journal of Supercomputing","issn_l":"0920-8542","issn":["0920-8542","1573-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Journal of Supercomputing","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.49000000953674316,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2074787299.pdf","grobid_xml":"https://content.openalex.org/works/W2074787299.grobid-xml"},"referenced_works_count":83,"referenced_works":["https://openalex.org/W28511425","https://openalex.org/W141121412","https://openalex.org/W256832651","https://openalex.org/W273624168","https://openalex.org/W560655873","https://openalex.org/W1482022028","https://openalex.org/W1505812833","https://openalex.org/W1518814527","https://openalex.org/W1520339130","https://openalex.org/W1524357507","https://openalex.org/W1533813939","https://openalex.org/W1537929875","https://openalex.org/W1544837488","https://openalex.org/W1555373975","https://openalex.org/W1575350781","https://openalex.org/W1576341769","https://openalex.org/W1577147036","https://openalex.org/W1631691693","https://openalex.org/W1668777023","https://openalex.org/W1837653564","https://openalex.org/W1945100066","https://openalex.org/W1987353583","https://openalex.org/W1997021720","https://openalex.org/W1997269120","https://openalex.org/W1999667809","https://openalex.org/W2001495258","https://openalex.org/W2004959907","https://openalex.org/W2016737685","https://openalex.org/W2028133900","https://openalex.org/W2036641664","https://openalex.org/W2042851811","https://openalex.org/W2043522772","https://openalex.org/W2045879521","https://openalex.org/W2047317675","https://openalex.org/W2048894106","https://openalex.org/W2051642332","https://openalex.org/W2086815197","https://openalex.org/W2089536264","https://openalex.org/W2097719961","https://openalex.org/W2097833816","https://openalex.org/W2100970777","https://openalex.org/W2105039796","https://openalex.org/W2107263349","https://openalex.org/W2112510389","https://openalex.org/W2118706534","https://openalex.org/W2118926411","https://openalex.org/W2120185818","https://openalex.org/W2126969927","https://openalex.org/W2131053137","https://openalex.org/W2133201251","https://openalex.org/W2133553843","https://openalex.org/W2134054737","https://openalex.org/W2137905605","https://openalex.org/W2139244298","https://openalex.org/W2142395340","https://openalex.org/W2142812297","https://openalex.org/W2145982220","https://openalex.org/W2149719295","https://openalex.org/W2152893862","https://openalex.org/W2157762234","https://openalex.org/W2157801087","https://openalex.org/W2158907675","https://openalex.org/W2171453084","https://openalex.org/W2294047952","https://openalex.org/W2481033286","https://openalex.org/W2600844214","https://openalex.org/W2911646362","https://openalex.org/W2914982603","https://openalex.org/W3035757797","https://openalex.org/W3137220996","https://openalex.org/W3141978889","https://openalex.org/W3152077726","https://openalex.org/W4210749408","https://openalex.org/W4231410376","https://openalex.org/W4233061857","https://openalex.org/W4235444176","https://openalex.org/W4245145882","https://openalex.org/W4246275617","https://openalex.org/W4248325045","https://openalex.org/W4249873503","https://openalex.org/W4251708180","https://openalex.org/W4252273460","https://openalex.org/W4302440668"],"related_works":["https://openalex.org/W2073684863","https://openalex.org/W2103295733","https://openalex.org/W2363040373","https://openalex.org/W2808531585","https://openalex.org/W1532689837","https://openalex.org/W1538240937","https://openalex.org/W2927703147","https://openalex.org/W4233801908","https://openalex.org/W2204165862","https://openalex.org/W2512132837"],"abstract_inverted_index":{"In":[0,41],"recent":[1],"years,":[2],"High":[3],"Performance":[4],"Computing":[5],"(HPC)":[6],"systems":[7,33,52,62],"have":[8],"been":[9],"shifting":[10],"from":[11],"expensive":[12],"massively":[13],"parallel":[14],"architectures":[15],"to":[16,21,119,128],"clusters":[17,80],"of":[18,24,50,98,115,131],"commodity":[19],"PCs":[20],"take":[22],"advantage":[23],"cost":[25],"and":[26,53,63,102],"performance":[27],"benefits.":[28],"Fault":[29],"tolerance":[30,58],"in":[31,122],"such":[32],"is":[34,105,118],"a":[35,103],"growing":[36],"concern":[37],"for":[38,60,75,88,107],"long-running":[39,76,89],"applications.":[40],"this":[42,116],"paper,":[43],"we":[44],"briefly":[45],"review":[46],"the":[47,56,95,123],"failure":[48],"rates":[49],"HPC":[51,61,79,92],"also":[54],"survey":[55],"fault":[57],"approaches":[59],"issues":[64],"with":[65],"these":[66],"approaches.":[67],"Rollback-recovery":[68],"techniques":[69],"which":[70],"are":[71,81,85,100],"most":[72],"often":[73],"used":[74,87],"applications":[77,90],"on":[78,91],"discussed":[82,101],"because":[83],"they":[84],"widely":[86],"systems.":[93],"Specifically,":[94],"feature":[96],"requirements":[97],"rollback-recovery":[99],"taxonomy":[104],"developed":[106],"over":[108],"twenty":[109],"popular":[110],"checkpoint/restart":[111],"solutions.":[112,134],"The":[113],"intent":[114],"paper":[117],"aid":[120],"researchers":[121],"domain":[124],"as":[125,127],"well":[126],"facilitate":[129],"development":[130],"new":[132],"checkpointing":[133]},"counts_by_year":[{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":9},{"year":2023,"cited_by_count":12},{"year":2022,"cited_by_count":10},{"year":2021,"cited_by_count":19},{"year":2020,"cited_by_count":21},{"year":2019,"cited_by_count":17},{"year":2018,"cited_by_count":39},{"year":2017,"cited_by_count":36},{"year":2016,"cited_by_count":29},{"year":2015,"cited_by_count":27},{"year":2014,"cited_by_count":14},{"year":2013,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
