{"id":"https://openalex.org/W2899455601","doi":"https://doi.org/10.1109/cluster.2018.00067","title":"A Failure Prediction-Based Adaptive Checkpointing Method with Less Reliance on Temperature Monitoring for HPC Applications","display_name":"A Failure Prediction-Based Adaptive Checkpointing Method with Less Reliance on Temperature Monitoring for HPC Applications","publication_year":2018,"publication_date":"2018-09-01","ids":{"openalex":"https://openalex.org/W2899455601","doi":"https://doi.org/10.1109/cluster.2018.00067","mag":"2899455601"},"language":"en","primary_location":{"id":"doi:10.1109/cluster.2018.00067","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster.2018.00067","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Cluster Computing (CLUSTER)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088329190","display_name":"Muhammad Alfian Amrizal","orcid":"https://orcid.org/0000-0003-1124-5137"},"institutions":[{"id":"https://openalex.org/I201537933","display_name":"Tohoku University","ror":"https://ror.org/01dq60k83","country_code":"JP","type":"education","lineage":["https://openalex.org/I201537933"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Muhammad Alfian Amrizal","raw_affiliation_strings":["Research Institute of Electrical Communication, Tohoku University, Sendai, Miyagi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Research Institute of Electrical Communication, Tohoku University, Sendai, Miyagi, Japan","institution_ids":["https://openalex.org/I201537933"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017847162","display_name":"Pei Li","orcid":"https://orcid.org/0000-0003-1976-5241"},"institutions":[{"id":"https://openalex.org/I201537933","display_name":"Tohoku University","ror":"https://ror.org/01dq60k83","country_code":"JP","type":"education","lineage":["https://openalex.org/I201537933"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Pei Li","raw_affiliation_strings":["Graduate School of information Sciences, Tohoku University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of information Sciences, Tohoku University","institution_ids":["https://openalex.org/I201537933"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073353406","display_name":"Mulya Agung","orcid":"https://orcid.org/0000-0001-9521-2177"},"institutions":[{"id":"https://openalex.org/I201537933","display_name":"Tohoku University","ror":"https://ror.org/01dq60k83","country_code":"JP","type":"education","lineage":["https://openalex.org/I201537933"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Mulya Agung","raw_affiliation_strings":["Graduate School of information Sciences, Tohoku University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of information Sciences, Tohoku University","institution_ids":["https://openalex.org/I201537933"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033954091","display_name":"Ryusuke Egawa","orcid":"https://orcid.org/0000-0001-8966-867X"},"institutions":[{"id":"https://openalex.org/I201537933","display_name":"Tohoku University","ror":"https://ror.org/01dq60k83","country_code":"JP","type":"education","lineage":["https://openalex.org/I201537933"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Ryusuke Egawa","raw_affiliation_strings":["Graduate School of information Sciences, Tohoku University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of information Sciences, Tohoku University","institution_ids":["https://openalex.org/I201537933"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5011172934","display_name":"Hiroyuki Takizawa","orcid":"https://orcid.org/0000-0003-2858-3140"},"institutions":[{"id":"https://openalex.org/I4210093896","display_name":"Tohoku University Hospital","ror":"https://ror.org/00kcd6x60","country_code":"JP","type":"healthcare","lineage":["https://openalex.org/I4210093896"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroyuki Takizawa","raw_affiliation_strings":["Cyberscience Center, Tohoku University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Cyberscience Center, Tohoku University","institution_ids":["https://openalex.org/I4210093896"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.1976,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.59024734,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"162","issue":null,"first_page":"515","last_page":"523"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9884999990463257,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mean-time-between-failures","display_name":"Mean time between failures","score":0.8628830909729004},{"id":"https://openalex.org/keywords/weibull-distribution","display_name":"Weibull distribution","score":0.7727638483047485},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7637304067611694},{"id":"https://openalex.org/keywords/interval","display_name":"Interval (graph theory)","score":0.7452061176300049},{"id":"https://openalex.org/keywords/constant","display_name":"Constant (computer programming)","score":0.6946353316307068},{"id":"https://openalex.org/keywords/exponential-distribution","display_name":"Exponential distribution","score":0.6036471128463745},{"id":"https://openalex.org/keywords/failure-rate","display_name":"Failure rate","score":0.5856640934944153},{"id":"https://openalex.org/keywords/exponential-function","display_name":"Exponential function","score":0.44828617572784424},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.4371792674064636},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.3456646203994751},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.330067902803421},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.32308533787727356},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3017376661300659},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.11387786269187927},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.0941796600818634}],"concepts":[{"id":"https://openalex.org/C44154001","wikidata":"https://www.wikidata.org/wiki/Q754940","display_name":"Mean time between failures","level":3,"score":0.8628830909729004},{"id":"https://openalex.org/C173291955","wikidata":"https://www.wikidata.org/wiki/Q732332","display_name":"Weibull distribution","level":2,"score":0.7727638483047485},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7637304067611694},{"id":"https://openalex.org/C2778067643","wikidata":"https://www.wikidata.org/wiki/Q166507","display_name":"Interval (graph theory)","level":2,"score":0.7452061176300049},{"id":"https://openalex.org/C2777027219","wikidata":"https://www.wikidata.org/wiki/Q1284190","display_name":"Constant (computer programming)","level":2,"score":0.6946353316307068},{"id":"https://openalex.org/C55350006","wikidata":"https://www.wikidata.org/wiki/Q237193","display_name":"Exponential distribution","level":2,"score":0.6036471128463745},{"id":"https://openalex.org/C163164238","wikidata":"https://www.wikidata.org/wiki/Q2737027","display_name":"Failure rate","level":2,"score":0.5856640934944153},{"id":"https://openalex.org/C151376022","wikidata":"https://www.wikidata.org/wiki/Q168698","display_name":"Exponential function","level":2,"score":0.44828617572784424},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.4371792674064636},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3456646203994751},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.330067902803421},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.32308533787727356},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3017376661300659},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.11387786269187927},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0941796600818634},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cluster.2018.00067","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster.2018.00067","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Cluster Computing (CLUSTER)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W190397983","https://openalex.org/W1578105119","https://openalex.org/W1966243865","https://openalex.org/W1979594720","https://openalex.org/W1993660990","https://openalex.org/W1998221613","https://openalex.org/W1999900893","https://openalex.org/W2033656974","https://openalex.org/W2039631162","https://openalex.org/W2043439509","https://openalex.org/W2060573894","https://openalex.org/W2101221989","https://openalex.org/W2106119405","https://openalex.org/W2112082363","https://openalex.org/W2113273853","https://openalex.org/W2121575774","https://openalex.org/W2127433432","https://openalex.org/W2131053137","https://openalex.org/W2132914442","https://openalex.org/W2133046454","https://openalex.org/W2142812297","https://openalex.org/W2160821994","https://openalex.org/W2318507312","https://openalex.org/W2528110265","https://openalex.org/W2604753019","https://openalex.org/W2774682496","https://openalex.org/W2997622231","https://openalex.org/W3111163196","https://openalex.org/W4229938669","https://openalex.org/W4233278384","https://openalex.org/W4242361160","https://openalex.org/W4285719527","https://openalex.org/W6665528376","https://openalex.org/W6677077650","https://openalex.org/W6786747289"],"related_works":["https://openalex.org/W3097728737","https://openalex.org/W2382998060","https://openalex.org/W2612366884","https://openalex.org/W2360037012","https://openalex.org/W2053233382","https://openalex.org/W2271281602","https://openalex.org/W2034080945","https://openalex.org/W2337334590","https://openalex.org/W2782720998","https://openalex.org/W1034752577"],"abstract_inverted_index":{"Checkpointing":[0],"with":[1,105,151,225],"a":[2,6,45,64,89,102,201,226],"constant":[3,8,92,222],"checkpoint":[4,108],"interval,":[5,109],"so-called":[7],"checkpointing":[9,93,103,113,149,223],"method,":[10,114,126],"is":[11,44,73,95,115,194],"commonly":[12],"used":[13],"in":[14,134],"HPC":[15],"field":[16],"and":[17,51,76,100],"has":[18],"been":[19],"proved":[20],"to":[21,117,122,136,172,177,220],"be":[22,131],"the":[23,35,57,60,70,77,91,97,124,127,138,155,162,169,174,181,185,191,198,210,215,221],"optimal":[24,98],"solution":[25,99],"for":[26,140],"failures":[27],"whose":[28],"inter-arrival":[29,79],"times":[30,80],"are":[31],"distributed":[32],"exponentially.":[33],"On":[34],"other":[36],"hand,":[37],"previous":[38],"works":[39],"have":[40],"shown":[41],"that":[42,69,209],"there":[43],"high":[46,119],"correlation":[47],"between":[48],"processor":[49,128],"temperature":[50,61,129,156,229],"its":[52],"failure":[53,71,78,178,193],"rate.":[54],"By":[55],"analyzing":[56],"results":[58,207],"of":[59,164,180,190,200],"monitoring":[62,230],"on":[63,154,197],"parallel":[65],"application,":[66],"we":[67,145],"noticed":[68],"rate":[72],"dynamically":[74],"changing":[75],"do":[81],"not":[82,96],"follow":[83],"an":[84,106,111,147],"exponential":[85],"distribution.":[86,204],"Under":[87],"such":[88],"scenario,":[90],"method":[94,104,150,160,212,224],"thus":[101],"adaptive":[107,112,125,148],"called":[110,168,184],"required":[116],"achieve":[118],"performance.":[120],"However,":[121],"use":[123],"must":[130],"constantly":[132],"monitored":[133],"order":[135],"decide":[137],"timing":[139,189],"checkpointing.":[141],"In":[142],"this":[143],"paper,":[144],"propose":[146],"less":[152],"reliance":[153],"monitoring.":[157],"Our":[158],"proposed":[159,211],"uses":[161],"timings":[163],"already":[165],"occurred":[166],"failures,":[167,171],"prior":[170],"estimate":[173],"mean":[175],"time":[176,218],"(MTTF)":[179],"next":[182],"failure,":[183],"posterior":[186,192],"failure.":[187],"The":[188,205],"predicted":[195],"based":[196],"characteristic":[199],"truncated":[202],"Weibull":[203],"simulation":[206],"show":[208],"can":[213],"reduce":[214],"total":[216],"wasted":[217],"compared":[219],"considerably":[227],"small":[228],"period.":[231]},"counts_by_year":[{"year":2020,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
