{"id":"https://openalex.org/W2056966287","doi":"https://doi.org/10.1109/cluster.2014.6968778","title":"To checkpoint or not to checkpoint: Understanding energy-performance-I/O tradeoffs in HPC checkpointing","display_name":"To checkpoint or not to checkpoint: Understanding energy-performance-I/O tradeoffs in HPC checkpointing","publication_year":2014,"publication_date":"2014-09-01","ids":{"openalex":"https://openalex.org/W2056966287","doi":"https://doi.org/10.1109/cluster.2014.6968778","mag":"2056966287"},"language":"en","primary_location":{"id":"doi:10.1109/cluster.2014.6968778","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster.2014.6968778","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2014 IEEE International Conference on Cluster Computing (CLUSTER)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041608911","display_name":"Nosayba El-Sayed","orcid":null},"institutions":[{"id":"https://openalex.org/I185261750","display_name":"University of Toronto","ror":"https://ror.org/03dbr7087","country_code":"CA","type":"education","lineage":["https://openalex.org/I185261750"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Nosayba El-Sayed","raw_affiliation_strings":["Department of Computer Science, University of Toronto","Department of Computer Science University of Toronto, Canada#TAB#"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Toronto","institution_ids":["https://openalex.org/I185261750"]},{"raw_affiliation_string":"Department of Computer Science University of Toronto, Canada#TAB#","institution_ids":["https://openalex.org/I185261750"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040893798","display_name":"Bianca Schroeder","orcid":"https://orcid.org/0000-0003-3289-1824"},"institutions":[{"id":"https://openalex.org/I185261750","display_name":"University of Toronto","ror":"https://ror.org/03dbr7087","country_code":"CA","type":"education","lineage":["https://openalex.org/I185261750"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Bianca Schroeder","raw_affiliation_strings":["Department of Computer Science, University of Toronto","Department of Computer Science University of Toronto, Canada#TAB#"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Toronto","institution_ids":["https://openalex.org/I185261750"]},{"raw_affiliation_string":"Department of Computer Science University of Toronto, Canada#TAB#","institution_ids":["https://openalex.org/I185261750"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5041608911"],"corresponding_institution_ids":["https://openalex.org/I185261750"],"apc_list":null,"apc_paid":null,"fwci":3.4481,"has_fulltext":false,"cited_by_count":23,"citation_normalized_percentile":{"value":0.93195855,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"93","last_page":"102"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8450444936752319},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6356770992279053},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6201032400131226},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5982338190078735},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5795022249221802},{"id":"https://openalex.org/keywords/energy-consumption","display_name":"Energy consumption","score":0.5290179252624512},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5268368124961853},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.4683386981487274},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.44291016459465027},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4238438904285431},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3886341452598572},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2659029960632324},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.07329043745994568}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8450444936752319},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6356770992279053},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6201032400131226},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5982338190078735},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5795022249221802},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.5290179252624512},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5268368124961853},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.4683386981487274},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.44291016459465027},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4238438904285431},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3886341452598572},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2659029960632324},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.07329043745994568},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cluster.2014.6968778","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster.2014.6968778","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2014 IEEE International Conference on Cluster Computing (CLUSTER)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.9100000262260437,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320334593","display_name":"Natural Sciences and Engineering Research Council of Canada","ror":"https://ror.org/01h531d29"},{"id":"https://openalex.org/F4320338304","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1977792483","https://openalex.org/W1991872262","https://openalex.org/W2003569970","https://openalex.org/W2008909725","https://openalex.org/W2033656974","https://openalex.org/W2040509723","https://openalex.org/W2056966287","https://openalex.org/W2060836295","https://openalex.org/W2077711217","https://openalex.org/W2133046454","https://openalex.org/W2142812297","https://openalex.org/W2152023716","https://openalex.org/W2200913013","https://openalex.org/W6633553838","https://openalex.org/W6682145291","https://openalex.org/W6687586697"],"related_works":["https://openalex.org/W17155033","https://openalex.org/W3207760230","https://openalex.org/W1496222301","https://openalex.org/W1590307681","https://openalex.org/W2536018345","https://openalex.org/W4312814274","https://openalex.org/W1862835629","https://openalex.org/W2136799148","https://openalex.org/W2897533804","https://openalex.org/W2890506991"],"abstract_inverted_index":{"As":[0],"the":[1,50,71,87,118,142,146,177,183,229,235],"scale":[2],"of":[3,49,90,117,126,182,223,231],"high-performance":[4],"computing":[5],"(HPC)":[6],"clusters":[7],"continues":[8],"to":[9,29,95,158,250],"grow,":[10],"their":[11],"increasing":[12],"failure":[13,192],"rates":[14],"and":[15,52,80,154,175,179,247],"energy":[16,53,88,99,147,163,207,253],"consumption":[17],"levels":[18],"are":[19,27,241],"emerging":[20],"as":[21,135,137],"two":[22],"serious":[23],"design":[24],"concerns":[25],"that":[26,132,220,240],"expected":[28],"become":[30],"more":[31,242],"challenging":[32],"in":[33,141],"future":[34],"Exascale":[35],"systems.":[36],"Therefore,":[37],"efficiently":[38],"running":[39],"systems":[40],"at":[41],"such":[42],"large":[43],"scales":[44],"requires":[45],"an":[46,114,124],"in-depth":[47],"understanding":[48],"performance":[51,84,120],"costs":[54],"associated":[55,122],"with":[56,123,165,209,254],"different":[57,184],"fault":[58,65],"tolerance":[59,66],"techniques.":[60],"The":[61],"most":[62],"commonly":[63],"used":[64],"method":[67],"is":[68],"checkpoint/restart.":[69],"Over":[70],"years,":[72],"checkpoint":[73,127,160],"scheduling":[74,128,161],"policies":[75,92,131,239],"have":[76],"been":[77],"traditionally":[78],"optimized":[79],"analysed":[81],"from":[82,194],"a":[83,150,168,210,255],"perspective.":[85],"Understanding":[86],"profile":[89],"these":[91],"or":[93,166],"how":[94,249],"optimize":[96,159,251],"them":[97],"for":[98,149,162,204,244,252],"savings":[100,208],"(rather":[101],"than":[102],"performance),":[103],"remain":[104],"not":[105],"very":[106],"well":[107,136],"understood.":[108],"In":[109],"this":[110],"paper,":[111],"we":[112,133],"provide":[113,155],"extensive":[115],"analysis":[116],"energy/":[119],"tradeoffs":[121],"array":[125],"policies,":[129],"including":[130],"propose,":[134],"few":[138],"existing":[139],"ones":[140],"literature.":[143],"We":[144,172,226],"estimate":[145],"overhead":[148,213],"given":[151],"checkpointing":[152,218,233],"policy,":[153],"simple":[156],"formulas":[157],"savings,":[164,246],"without":[167],"bound":[169,256],"on":[170,191,234,257],"runtime.":[171],"then":[173],"evaluate":[174],"compare":[176],"runtime-optimized":[178],"energy-optimized":[180,232],"versions":[181],"methods":[185,219],"using":[186,215],"trace":[187],"driven":[188],"simulations":[189],"based":[190],"logs":[193],"10":[195],"production":[196],"HPC":[197,224],"clusters.":[198],"Our":[199],"results":[200],"show":[201],"ample":[202],"room":[203],"achieving":[205],"high":[206],"low":[211],"runtime":[212],"when":[214],"non-constant":[216],"(adaptive)":[217],"exploit":[221],"characteristics":[222],"failures.":[225],"also":[227],"analyze":[228],"impact":[230],"storage":[236],"subsystem,":[237],"identify":[238],"optimal":[243],"I/O":[245,258],"study":[248],"time.":[259]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":5},{"year":2016,"cited_by_count":4},{"year":2014,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
