{"id":"https://openalex.org/W4387302750","doi":"https://doi.org/10.1145/3600006.3613145","title":"GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints","display_name":"GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints","publication_year":2023,"publication_date":"2023-10-03","ids":{"openalex":"https://openalex.org/W4387302750","doi":"https://doi.org/10.1145/3600006.3613145"},"language":"en","primary_location":{"id":"doi:10.1145/3600006.3613145","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3600006.3613145","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100697510","display_name":"Zhuang Wang","orcid":"https://orcid.org/0000-0002-1614-0601"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zhuang Wang","raw_affiliation_strings":["Department of Computer Science, Rice University, Houston, Texas, United States of America"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Rice University, Houston, Texas, United States of America","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102854276","display_name":"Zhen Jia","orcid":"https://orcid.org/0000-0003-3543-2324"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Jia","raw_affiliation_strings":["Amazon Web Services, Santa Clara, California, USA"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Santa Clara, California, USA","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019249087","display_name":"Shuai Zheng","orcid":"https://orcid.org/0000-0003-3093-6486"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shuai Zheng","raw_affiliation_strings":["Amazon Web Services, Santa Clara, California, United States of America"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Santa Clara, California, United States of America","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029095203","display_name":"Zhen Zhang","orcid":"https://orcid.org/0000-0002-0164-0849"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Zhang","raw_affiliation_strings":["Amazon Web Services, Santa Clara, California, United States of America"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Santa Clara, California, United States of America","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020914269","display_name":"Xinwei Fu","orcid":"https://orcid.org/0009-0004-7822-5450"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xinwei Fu","raw_affiliation_strings":["Amazon Web Services, Santa Clara, California, United States of America"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Santa Clara, California, United States of America","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075120727","display_name":"T. S. Eugene Ng","orcid":"https://orcid.org/0000-0003-2954-0767"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"T. S. Eugene Ng","raw_affiliation_strings":["Rice University, Houston, Texas, United States of America"],"affiliations":[{"raw_affiliation_string":"Rice University, Houston, Texas, United States of America","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101638214","display_name":"Yida Wang","orcid":"https://orcid.org/0000-0001-8165-840X"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yida Wang","raw_affiliation_strings":["Amazon Web Services, Santa Clara, California, United States of America"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Santa Clara, California, United States of America","institution_ids":["https://openalex.org/I1311688040"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100697510"],"corresponding_institution_ids":["https://openalex.org/I74775410"],"apc_list":null,"apc_paid":null,"fwci":6.9199,"has_fulltext":false,"cited_by_count":58,"citation_normalized_percentile":{"value":0.97935534,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"364","last_page":"381"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9926999807357788,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9890000224113464,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7290277481079102},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6554828882217407},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.6314879655838013},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.48764216899871826},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.47844743728637695},{"id":"https://openalex.org/keywords/distributed-data-store","display_name":"Distributed data store","score":0.428579717874527},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.29601970314979553}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7290277481079102},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6554828882217407},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.6314879655838013},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.48764216899871826},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.47844743728637695},{"id":"https://openalex.org/C24885549","wikidata":"https://www.wikidata.org/wiki/Q339678","display_name":"Distributed data store","level":2,"score":0.428579717874527},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.29601970314979553},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3600006.3613145","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3600006.3613145","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5400000214576721,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[{"id":"https://openalex.org/G2064613317","display_name":null,"funder_award_id":"CNS-1815525","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4662746506","display_name":null,"funder_award_id":"CNS-2214272","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":59,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W1502450072","https://openalex.org/W1964981582","https://openalex.org/W1966243865","https://openalex.org/W1981276685","https://openalex.org/W1999900893","https://openalex.org/W2087402357","https://openalex.org/W2126969025","https://openalex.org/W2139391817","https://openalex.org/W2146033465","https://openalex.org/W2155204206","https://openalex.org/W2165698076","https://openalex.org/W2183341477","https://openalex.org/W2194775991","https://openalex.org/W2260756217","https://openalex.org/W2296772319","https://openalex.org/W2340222647","https://openalex.org/W2612690371","https://openalex.org/W2626696598","https://openalex.org/W2734941459","https://openalex.org/W2740001873","https://openalex.org/W2900985109","https://openalex.org/W2910100551","https://openalex.org/W2914209329","https://openalex.org/W2926767350","https://openalex.org/W2944793600","https://openalex.org/W2969388332","https://openalex.org/W2970971581","https://openalex.org/W2975712713","https://openalex.org/W3021124033","https://openalex.org/W3037519745","https://openalex.org/W3044837714","https://openalex.org/W3081168214","https://openalex.org/W3097108668","https://openalex.org/W3101708369","https://openalex.org/W3115029474","https://openalex.org/W3129831491","https://openalex.org/W3129927603","https://openalex.org/W3134991928","https://openalex.org/W3137114593","https://openalex.org/W3158296418","https://openalex.org/W3190774216","https://openalex.org/W3204998121","https://openalex.org/W4212774754","https://openalex.org/W4213383978","https://openalex.org/W4220741164","https://openalex.org/W4225004481","https://openalex.org/W4226479682","https://openalex.org/W4253233651","https://openalex.org/W4288357791","https://openalex.org/W4312983671","https://openalex.org/W4372267133","https://openalex.org/W6600737549","https://openalex.org/W6682904396","https://openalex.org/W6695914560","https://openalex.org/W6713134421","https://openalex.org/W6739901393","https://openalex.org/W6779740189","https://openalex.org/W6779748563"],"related_works":["https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W3216976533","https://openalex.org/W100620283","https://openalex.org/W2495260952","https://openalex.org/W4366179611","https://openalex.org/W2996078371"],"abstract_inverted_index":{"Large":[0],"deep":[1],"learning":[2],"models":[3],"have":[4,34],"recently":[5],"garnered":[6],"substantial":[7],"attention":[8],"from":[9],"both":[10],"academia":[11],"and":[12,28],"industry.":[13],"Nonetheless,":[14],"frequent":[15],"failures":[16],"are":[17],"observed":[18],"during":[19],"large":[20],"model":[21],"training":[22,30],"due":[23,39],"to":[24,40],"large-scale":[25],"resources":[26],"involved":[27],"extended":[29],"time.":[31],"Existing":[32],"solutions":[33],"significant":[35],"failure":[36],"recovery":[37],"costs":[38],"the":[41,46],"severe":[42],"restriction":[43],"imposed":[44],"by":[45],"bandwidth":[47],"of":[48],"remote":[49],"storage":[50],"in":[51],"which":[52],"they":[53],"store":[54],"checkpoints.":[55]},"counts_by_year":[{"year":2026,"cited_by_count":6},{"year":2025,"cited_by_count":34},{"year":2024,"cited_by_count":17},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-11T08:14:18.477133","created_date":"2025-10-10T00:00:00"}
