{"id":"https://openalex.org/W4400770595","doi":"https://doi.org/10.1109/tpds.2024.3429625","title":"Swift: Expedited Failure Recovery for Large-Scale DNN Training","display_name":"Swift: Expedited Failure Recovery for Large-Scale DNN Training","publication_year":2024,"publication_date":"2024-07-18","ids":{"openalex":"https://openalex.org/W4400770595","doi":"https://doi.org/10.1109/tpds.2024.3429625"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2024.3429625","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3429625","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113411156","display_name":"Yueqiao Zhong","orcid":"https://orcid.org/0000-0003-3859-1566"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Yuchen Zhong","raw_affiliation_strings":["University of Hong Kong, Hong Kong, SAR, China"],"raw_orcid":"https://orcid.org/0000-0003-3859-1566","affiliations":[{"raw_affiliation_string":"University of Hong Kong, Hong Kong, SAR, China","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020661868","display_name":"Guangming Sheng","orcid":"https://orcid.org/0000-0003-3395-3994"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Guangming Sheng","raw_affiliation_strings":["University of Hong Kong, Hong Kong, SAR, China"],"raw_orcid":"https://orcid.org/0000-0003-3395-3994","affiliations":[{"raw_affiliation_string":"University of Hong Kong, Hong Kong, SAR, China","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101415161","display_name":"Juncheng Liu","orcid":"https://orcid.org/0000-0002-5895-0581"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Juncheng Liu","raw_affiliation_strings":["OneFlow Inc., Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"OneFlow Inc., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101489641","display_name":"Yuan Jinhui","orcid":"https://orcid.org/0000-0002-0700-2645"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinhui Yuan","raw_affiliation_strings":["OneFlow Inc., Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"OneFlow Inc., Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012597518","display_name":"Chuan Wu","orcid":"https://orcid.org/0000-0002-3144-4398"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chuan Wu","raw_affiliation_strings":["University of Hong Kong, Hong Kong, SAR, China"],"raw_orcid":"https://orcid.org/0000-0002-3144-4398","affiliations":[{"raw_affiliation_string":"University of Hong Kong, Hong Kong, SAR, China","institution_ids":["https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5113411156"],"corresponding_institution_ids":["https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":0.648,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.67014328,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":"35","issue":"9","first_page":"1644","last_page":"1656"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11993","display_name":"Atomic and Subatomic Physics Research","score":0.7602999806404114,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11993","display_name":"Atomic and Subatomic Physics Research","score":0.7602999806404114,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7857420444488525},{"id":"https://openalex.org/keywords/swift","display_name":"Swift","score":0.7361571192741394},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5887956023216248},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5673072338104248},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3796979784965515}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7857420444488525},{"id":"https://openalex.org/C116188536","wikidata":"https://www.wikidata.org/wiki/Q17118377","display_name":"Swift","level":2,"score":0.7361571192741394},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5887956023216248},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5673072338104248},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3796979784965515},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpds.2024.3429625","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3429625","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},{"id":"pmh:oai:hub.hku.hk:10722/359567","is_oa":false,"landing_page_url":"https://hub.hku.hk/handle/10722/359567","pdf_url":null,"source":{"id":"https://openalex.org/S4377196271","display_name":"The HKU Scholars Hub (University of Hong Kong)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I889458895","host_organization_name":"University of Hong Kong","host_organization_lineage":["https://openalex.org/I889458895"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6000000238418579,"id":"https://metadata.un.org/sdg/13","display_name":"Climate action"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":52,"referenced_works":["https://openalex.org/W1517447268","https://openalex.org/W2081409107","https://openalex.org/W2117539524","https://openalex.org/W2119738171","https://openalex.org/W2153399951","https://openalex.org/W2186615578","https://openalex.org/W2266486761","https://openalex.org/W2302255633","https://openalex.org/W2336650964","https://openalex.org/W2489529491","https://openalex.org/W2622263826","https://openalex.org/W2798515322","https://openalex.org/W2892474000","https://openalex.org/W2896457183","https://openalex.org/W2919372546","https://openalex.org/W2963390429","https://openalex.org/W2964137095","https://openalex.org/W2969388332","https://openalex.org/W2973727699","https://openalex.org/W2981937105","https://openalex.org/W2991040477","https://openalex.org/W3094502228","https://openalex.org/W3132107458","https://openalex.org/W3140077234","https://openalex.org/W3204998121","https://openalex.org/W3205803342","https://openalex.org/W3210871626","https://openalex.org/W6631190155","https://openalex.org/W6679815717","https://openalex.org/W6686509673","https://openalex.org/W6703420464","https://openalex.org/W6713134421","https://openalex.org/W6739622702","https://openalex.org/W6741529945","https://openalex.org/W6745245109","https://openalex.org/W6747620207","https://openalex.org/W6753584990","https://openalex.org/W6755257872","https://openalex.org/W6757817989","https://openalex.org/W6758283263","https://openalex.org/W6762287338","https://openalex.org/W6767997687","https://openalex.org/W6778883912","https://openalex.org/W6779863968","https://openalex.org/W6781728138","https://openalex.org/W6784836957","https://openalex.org/W6787673396","https://openalex.org/W6791204390","https://openalex.org/W6792188503","https://openalex.org/W6794474486","https://openalex.org/W6804052296","https://openalex.org/W6811928498"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W3198734187","https://openalex.org/W3175001965","https://openalex.org/W2350024083","https://openalex.org/W4250183494","https://openalex.org/W3130985054","https://openalex.org/W608019500","https://openalex.org/W2478736918","https://openalex.org/W3130134152"],"abstract_inverted_index":{"As":[0],"the":[1,40,80,98,105,108,113,117,120,144,148,173,191],"size":[2],"of":[3,39,92,97,107,119],"deep":[4,73],"learning":[5],"models":[6,55],"gets":[7],"larger":[8],"and":[9,15,22,30,45,56,88,115,142,178,195],"larger,":[10],"training":[11,76,86,200,226],"takes":[12],"longer":[13],"time":[14,177,194,227],"more":[16,21,23],"resources,":[17],"making":[18,93],"fault":[19],"tolerance":[20],"critical.":[24],"Existing":[25],"state-of-the-art":[26,207,230],"methods":[27,208],"like":[28],"CheckFreq":[29],"Elastic":[31],"Horovod":[32],"need":[33],"to":[34,58,146,161,206,221,229],"back":[35],"up":[36,220],"a":[37,67,131,152],"copy":[38,96],"model":[41,89,99,109,121,212],"state":[42,110,122,150],"(i.e.,":[43],"parameters":[44],"optimizer":[46],"states)":[47],"in":[48,123,224],"memory,":[49],"which":[50,138],"is":[51,156],"costly":[52],"for":[53,71,126],"large":[54],"leads":[57],"non-trivial":[59],"overhead.":[60,182],"This":[61],"article":[62],"presents":[63],"<sc":[64,101,186,214],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[65,102,187,215],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Swift</small>,":[66],"novel":[68],"recovery":[69,82,164,176,193],"design":[70],"distributed":[72,157],"neural":[74],"network":[75],"that":[77,185],"significantly":[78,189],"reduces":[79,190],"failure":[81,114,127,163,192],"overhead":[83],"without":[84,209],"affecting":[85],"throughput":[87,201],"accuracy.":[90,213],"Instead":[91],"an":[94],"additional":[95],"state,":[100],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Swift</small>":[103,188,216],"resolves":[104],"inconsistencies":[106],"caused":[111],"by":[112],"exploits":[116],"replicas":[118,135],"data":[124,141,170,180],"parallelism":[125],"recovery.":[128],"We":[129,166],"propose":[130],"logging-based":[132],"approach":[133],"when":[134],"are":[136],"unavailable,":[137],"records":[139],"intermediate":[140,169,179],"replays":[143],"computation":[145],"recover":[147],"lost":[149],"upon":[151],"failure.":[153],"The":[154],"re-computation":[155],"across":[158],"multiple":[159],"machines":[160],"accelerate":[162],"further.":[165],"also":[167,218],"log":[168],"selectively,":[171],"exploring":[172],"trade-off":[174],"between":[175],"storage":[181],"Evaluations":[183],"show":[184],"achieves":[196],"similar":[197],"or":[198],"better":[199],"during":[202],"failure-free":[203],"execution":[204],"compared":[205,228],"degrading":[210],"final":[211],"can":[217],"achieve":[219],"1.16x":[222],"speedup":[223],"total":[225],"methods.":[231]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
