{"id":"https://openalex.org/W2064388050","doi":"https://doi.org/10.1145/2063384.2063428","title":"Checkpointing strategies for parallel jobs","display_name":"Checkpointing strategies for parallel jobs","publication_year":2011,"publication_date":"2011-11-08","ids":{"openalex":"https://openalex.org/W2064388050","doi":"https://doi.org/10.1145/2063384.2063428","mag":"2064388050"},"language":"en","primary_location":{"id":"doi:10.1145/2063384.2063428","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063384.2063428","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062080875","display_name":"Marin Bougeret","orcid":"https://orcid.org/0000-0002-9910-4656"},"institutions":[{"id":"https://openalex.org/I113428412","display_name":"\u00c9cole Normale Sup\u00e9rieure de Lyon","ror":"https://ror.org/04zmssz18","country_code":"FR","type":"education","lineage":["https://openalex.org/I113428412","https://openalex.org/I203339264"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Marin Bougeret","raw_affiliation_strings":["ENS Lyon, France","ENS-Lyon, France#TAB#"],"affiliations":[{"raw_affiliation_string":"ENS Lyon, France","institution_ids":["https://openalex.org/I113428412"]},{"raw_affiliation_string":"ENS-Lyon, France#TAB#","institution_ids":["https://openalex.org/I113428412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022674890","display_name":"Henri Casanova","orcid":"https://orcid.org/0000-0001-6310-0365"},"institutions":[{"id":"https://openalex.org/I117965899","display_name":"University of Hawai\u02bbi at M\u0101noa","ror":"https://ror.org/01wspgy28","country_code":"US","type":"education","lineage":["https://openalex.org/I117965899"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Henri Casanova","raw_affiliation_strings":["Univ. of Hawai'i at M\u0101noa, Honolulu","Univ. of Hawai'i at Manoa, Honolulu, USA"],"affiliations":[{"raw_affiliation_string":"Univ. of Hawai'i at M\u0101noa, Honolulu","institution_ids":["https://openalex.org/I117965899"]},{"raw_affiliation_string":"Univ. of Hawai'i at Manoa, Honolulu, USA","institution_ids":["https://openalex.org/I117965899"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038963808","display_name":"Mika\u00ebl Rabie","orcid":"https://orcid.org/0000-0001-6782-7625"},"institutions":[{"id":"https://openalex.org/I113428412","display_name":"\u00c9cole Normale Sup\u00e9rieure de Lyon","ror":"https://ror.org/04zmssz18","country_code":"FR","type":"education","lineage":["https://openalex.org/I113428412","https://openalex.org/I203339264"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Mikael Rabie","raw_affiliation_strings":["ENS Lyon, France","ENS-Lyon, France#TAB#"],"affiliations":[{"raw_affiliation_string":"ENS Lyon, France","institution_ids":["https://openalex.org/I113428412"]},{"raw_affiliation_string":"ENS-Lyon, France#TAB#","institution_ids":["https://openalex.org/I113428412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001838181","display_name":"Yves Robert","orcid":"https://orcid.org/0000-0003-2361-055X"},"institutions":[{"id":"https://openalex.org/I113428412","display_name":"\u00c9cole Normale Sup\u00e9rieure de Lyon","ror":"https://ror.org/04zmssz18","country_code":"FR","type":"education","lineage":["https://openalex.org/I113428412","https://openalex.org/I203339264"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Yves Robert","raw_affiliation_strings":["ENS Lyon, France","ENS-Lyon, France#TAB#"],"affiliations":[{"raw_affiliation_string":"ENS Lyon, France","institution_ids":["https://openalex.org/I113428412"]},{"raw_affiliation_string":"ENS-Lyon, France#TAB#","institution_ids":["https://openalex.org/I113428412"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086100728","display_name":"Fr\u00e9d\u00e9ric Vivien","orcid":"https://orcid.org/0000-0002-0663-6152"},"institutions":[{"id":"https://openalex.org/I1326498283","display_name":"Institut national de recherche en informatique et en automatique","ror":"https://ror.org/02kvxyf05","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1326498283"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Fr\u00e9d\u00e9ric Vivien","raw_affiliation_strings":["INRIA, Lyon, France","[INRIA,Lyon,France]"],"affiliations":[{"raw_affiliation_string":"INRIA, Lyon, France","institution_ids":["https://openalex.org/I1326498283"]},{"raw_affiliation_string":"[INRIA,Lyon,France]","institution_ids":["https://openalex.org/I1326498283"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5062080875"],"corresponding_institution_ids":["https://openalex.org/I113428412"],"apc_list":null,"apc_paid":null,"fwci":13.20778788,"has_fulltext":false,"cited_by_count":94,"citation_normalized_percentile":{"value":0.98910862,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13553","display_name":"Age of Information Optimization","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10974","display_name":"Advanced Queuing Theory Analysis","score":0.9898999929428101,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8405079245567322},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.7366501092910767},{"id":"https://openalex.org/keywords/weibull-distribution","display_name":"Weibull distribution","score":0.63787841796875},{"id":"https://openalex.org/keywords/dynamic-programming","display_name":"Dynamic programming","score":0.5811886787414551},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.5804402828216553},{"id":"https://openalex.org/keywords/exponential-function","display_name":"Exponential function","score":0.5246748328208923},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5099173784255981},{"id":"https://openalex.org/keywords/exponential-distribution","display_name":"Exponential distribution","score":0.5067155957221985},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.49498623609542847},{"id":"https://openalex.org/keywords/exponential-growth","display_name":"Exponential growth","score":0.4439542293548584},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.44320571422576904},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.32952994108200073},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.30178284645080566},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.09237471222877502},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07767415046691895}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8405079245567322},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.7366501092910767},{"id":"https://openalex.org/C173291955","wikidata":"https://www.wikidata.org/wiki/Q732332","display_name":"Weibull distribution","level":2,"score":0.63787841796875},{"id":"https://openalex.org/C37404715","wikidata":"https://www.wikidata.org/wiki/Q380679","display_name":"Dynamic programming","level":2,"score":0.5811886787414551},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.5804402828216553},{"id":"https://openalex.org/C151376022","wikidata":"https://www.wikidata.org/wiki/Q168698","display_name":"Exponential function","level":2,"score":0.5246748328208923},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5099173784255981},{"id":"https://openalex.org/C55350006","wikidata":"https://www.wikidata.org/wiki/Q237193","display_name":"Exponential distribution","level":2,"score":0.5067155957221985},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.49498623609542847},{"id":"https://openalex.org/C75235859","wikidata":"https://www.wikidata.org/wiki/Q582659","display_name":"Exponential growth","level":2,"score":0.4439542293548584},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.44320571422576904},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.32952994108200073},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30178284645080566},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.09237471222877502},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07767415046691895},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2063384.2063428","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063384.2063428","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","score":0.6000000238418579,"display_name":"Decent work and economic growth"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W94439627","https://openalex.org/W188363600","https://openalex.org/W1493970911","https://openalex.org/W1558516248","https://openalex.org/W1870609547","https://openalex.org/W1969359541","https://openalex.org/W1984564341","https://openalex.org/W2015651973","https://openalex.org/W2016411419","https://openalex.org/W2033656974","https://openalex.org/W2041250459","https://openalex.org/W2077911375","https://openalex.org/W2098259808","https://openalex.org/W2100970777","https://openalex.org/W2102576651","https://openalex.org/W2106017997","https://openalex.org/W2109192777","https://openalex.org/W2117667603","https://openalex.org/W2119567691","https://openalex.org/W2127433432","https://openalex.org/W2131051243","https://openalex.org/W2131629153","https://openalex.org/W2133046454","https://openalex.org/W2150255127","https://openalex.org/W2150871235","https://openalex.org/W2161537327","https://openalex.org/W2169246841","https://openalex.org/W2245014709","https://openalex.org/W2317514805","https://openalex.org/W4231150350","https://openalex.org/W4246678129","https://openalex.org/W4285719527","https://openalex.org/W4298023569","https://openalex.org/W7001894244"],"related_works":["https://openalex.org/W3083898685","https://openalex.org/W2037820527","https://openalex.org/W1973754976","https://openalex.org/W189075692","https://openalex.org/W1620508096","https://openalex.org/W2357734103","https://openalex.org/W1490217699","https://openalex.org/W3195534432","https://openalex.org/W2766105656","https://openalex.org/W1537624148"],"abstract_inverted_index":{"This":[0],"work":[1,75,93],"provides":[2,82],"an":[3,15],"analysis":[4],"of":[5,26,47,74,97,101,124,152],"checkpointing":[6,57,103],"strategies":[7],"for":[8,37,86,181],"minimizing":[9,87],"expected":[10,89],"job":[11,98],"execution":[12,90],"times":[13],"in":[14,149],"environment":[16],"that":[17,55,112,139,162,172],"is":[18,50,58],"subject":[19],"to":[20,44,70],"processor":[21],"failures.":[22,154],"In":[23],"the":[24,34,45,51,72,78,88,119,150],"case":[25,151],"both":[27],"sequential":[28],"and":[29,100],"parallel":[30,102],"jobs,":[31],"we":[32,64],"give":[33],"optimal":[35],"solution":[36],"exponentially":[38],"distributed":[39,62],"failure":[40,164],"inter-arrival":[41],"times,":[42],"which,":[43],"best":[46],"our":[48,133,140,173],"knowledge,":[49],"first":[52,106],"rigorous":[53],"proof":[54],"periodic":[56],"optimal.":[59],"For":[60],"non-exponentially":[61],"failures,":[63],"develop":[65],"a":[66,83],"dynamic":[67,141,174],"programming":[68,142,175],"algorithm":[69,143,176],"maximize":[71],"amount":[73],"completed":[76],"before":[77],"next":[79],"failure,":[80],"which":[81],"good":[84],"heuristic":[85],"time.":[91],"Our":[92],"considers":[94],"various":[95],"models":[96],"parallelism":[99],"overhead.":[104],"We":[105,155],"perform":[107],"extensive":[108],"simulation":[109,160],"experiments":[110,161],"assuming":[111],"failures":[113],"follow":[114],"Exponential":[115],"or":[116],"Weibull":[117,153],"distributions,":[118],"latter":[120],"being":[121],"more":[122],"representative":[123],"real-world":[125,182],"systems.":[126],"The":[127],"obtained":[128],"results":[129,158,170],"not":[130],"only":[131],"corroborate":[132],"theoretical":[134],"findings,":[135],"but":[136],"also":[137],"show":[138],"significantly":[144,177],"outperforms":[145,178],"previously":[146],"proposed":[147],"solutions":[148,180],"then":[156],"discuss":[157],"from":[159,166],"use":[163],"logs":[165],"production":[167],"clusters.":[168,183],"These":[169],"confirm":[171],"existing":[179]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":6},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":8},{"year":2017,"cited_by_count":4},{"year":2016,"cited_by_count":9},{"year":2015,"cited_by_count":14},{"year":2014,"cited_by_count":12},{"year":2013,"cited_by_count":16},{"year":2012,"cited_by_count":8}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
