{"id":"https://openalex.org/W3037298866","doi":"https://doi.org/10.1145/3369583.3392672","title":"Orchestrating Fault Prediction with Live Migration and Checkpointing","display_name":"Orchestrating Fault Prediction with Live Migration and Checkpointing","publication_year":2020,"publication_date":"2020-06-22","ids":{"openalex":"https://openalex.org/W3037298866","doi":"https://doi.org/10.1145/3369583.3392672","mag":"3037298866"},"language":"en","primary_location":{"id":"doi:10.1145/3369583.3392672","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3369583.3392672","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064446614","display_name":"Subhendu Behera","orcid":null},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Subhendu Behera","raw_affiliation_strings":["North Carolina State University, Raleigh, NC, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"North Carolina State University, Raleigh, NC, USA","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101790449","display_name":"Lipeng Wan","orcid":"https://orcid.org/0000-0003-2347-8667"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lipeng Wan","raw_affiliation_strings":["Oak Ridge National Laboratory, Oak Ridge, TN, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Oak Ridge National Laboratory, Oak Ridge, TN, USA","institution_ids":["https://openalex.org/I1289243028"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031749427","display_name":"Frank Mueller","orcid":"https://orcid.org/0000-0002-0258-0294"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Frank Mueller","raw_affiliation_strings":["North Carolina State University, Raleigh, NC, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"North Carolina State University, Raleigh, NC, USA","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015973975","display_name":"Matthew Wolf","orcid":"https://orcid.org/0000-0002-8393-4436"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Matthew Wolf","raw_affiliation_strings":["Oak Ridge National Laboratory, Oak Ridge, TN, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Oak Ridge National Laboratory, Oak Ridge, TN, USA","institution_ids":["https://openalex.org/I1289243028"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5030373337","display_name":"Scott Klasky","orcid":"https://orcid.org/0000-0003-3559-5772"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Scott Klasky","raw_affiliation_strings":["Oak Ridge National Laboratory, Oak Ridge, TN, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Oak Ridge National Laboratory, Oak Ridge, TN, USA","institution_ids":["https://openalex.org/I1289243028"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.1345,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.80076885,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"167","last_page":"171"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.7683139443397522},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7575005292892456},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.7269152402877808},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6892310380935669},{"id":"https://openalex.org/keywords/summit","display_name":"Summit","score":0.5447285175323486},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.45616111159324646},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4296827018260956},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.4032072126865387},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.39050376415252686}],"concepts":[{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.7683139443397522},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7575005292892456},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.7269152402877808},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6892310380935669},{"id":"https://openalex.org/C2778848561","wikidata":"https://www.wikidata.org/wiki/Q207326","display_name":"Summit","level":2,"score":0.5447285175323486},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.45616111159324646},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4296827018260956},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.4032072126865387},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.39050376415252686},{"id":"https://openalex.org/C100970517","wikidata":"https://www.wikidata.org/wiki/Q52107","display_name":"Physical geography","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3369583.3392672","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3369583.3392672","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.7599999904632568}],"awards":[{"id":"https://openalex.org/G2501833379","display_name":null,"funder_award_id":"1525609, 1813004","funder_id":"https://openalex.org/F4320315254","funder_display_name":"Innovative Research Group Project of the National Natural Science Foundation of China"},{"id":"https://openalex.org/G3652629119","display_name":null,"funder_award_id":"DE-AC05-00OR22725, Exascale Computing Project 17-SC-20-SC","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320315254","display_name":"Innovative Research Group Project of the National Natural Science Foundation of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W169659540","https://openalex.org/W1596936080","https://openalex.org/W1974534679","https://openalex.org/W1981432246","https://openalex.org/W1984564341","https://openalex.org/W2023324599","https://openalex.org/W2031080073","https://openalex.org/W2033656974","https://openalex.org/W2039631162","https://openalex.org/W2042506452","https://openalex.org/W2063793192","https://openalex.org/W2089536264","https://openalex.org/W2095157729","https://openalex.org/W2102576651","https://openalex.org/W2112748121","https://openalex.org/W2136797898","https://openalex.org/W2137512117","https://openalex.org/W2142812297","https://openalex.org/W2167563208","https://openalex.org/W2292469857","https://openalex.org/W2320681231","https://openalex.org/W2539230224","https://openalex.org/W2567534547","https://openalex.org/W2623640553","https://openalex.org/W2734985949","https://openalex.org/W2808242862","https://openalex.org/W2883852409","https://openalex.org/W2902041647","https://openalex.org/W2902896512","https://openalex.org/W2913367091","https://openalex.org/W2961595369","https://openalex.org/W2983819714","https://openalex.org/W3015731845","https://openalex.org/W3080528907","https://openalex.org/W4288231122"],"related_works":["https://openalex.org/W1982074779","https://openalex.org/W3203904831","https://openalex.org/W4241376597","https://openalex.org/W4230129158","https://openalex.org/W630050097","https://openalex.org/W1599154403","https://openalex.org/W1862835629","https://openalex.org/W2136799148","https://openalex.org/W2897533804","https://openalex.org/W2890506991"],"abstract_inverted_index":{"Checkpoint/Restart":[0],"(C/R)":[1],"is":[2],"widely":[3],"used":[4],"to":[5,69,89,105],"provide":[6],"fault":[7],"tolerance":[8],"on":[9],"High-Performance":[10],"Computing":[11],"(HPC)":[12],"systems.":[13],"However,":[14],"Parallel":[15],"File":[16],"System":[17],"(PFS)":[18],"overhead":[19,87],"and":[20,40,50,72,94],"failure":[21,38,45,92],"uncertainty":[22],"cause":[23],"significant":[24],"application":[25,86],"overhead.":[26],"This":[27],"paper":[28],"develops":[29],"an":[30],"adaptive":[31],"multi-level":[32],"C/R":[33],"model":[34],"that":[35],"incorporates":[36],"a":[37,81,99],"prediction":[39],"analysis":[41],"model,":[42],"which":[43,107],"orchestrates":[44],"prediction,":[46,93],"checkpointing,":[47,71],"checkpoint":[48,103],"frequency,":[49],"proactive":[51],"live":[52],"migration":[53],"along":[54],"with":[55],"the":[56,66,77,110,113],"additional":[57],"benefit":[58],"of":[59,83,112],"Burst":[60],"Buffers":[61],"(BB).":[62],"It":[63],"effectively":[64],"reduces":[65],"overheads":[67],"due":[68,88],"failures,":[70],"recovery.":[73],"Simulation":[74],"results":[75],"for":[76],"Summit":[78],"supercomputer":[79],"yield":[80],"reduction":[82],"~20%-86%":[84],"in":[85,102],"BBs,":[90,106],"orchestrated":[91],"migration.":[95],"We":[96],"also":[97],"observe":[98],"~29%":[100],"decrease":[101],"writes":[104],"can":[108],"increase":[109],"longevity":[111],"BB":[114],"storage":[115],"devices.":[116]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":4}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
