{"id":"https://openalex.org/W7155563116","doi":"https://doi.org/10.1145/3767295.3769322","title":"Handling Network Faults in Distributed AI Training: Failover is Now an Option","display_name":"Handling Network Faults in Distributed AI Training: Failover is Now an Option","publication_year":2026,"publication_date":"2026-04-24","ids":{"openalex":"https://openalex.org/W7155563116","doi":"https://doi.org/10.1145/3767295.3769322"},"language":null,"primary_location":{"id":"doi:10.1145/3767295.3769322","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3767295.3769322","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st European Conference on Computer Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3767295.3769322","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013210527","display_name":"Xin Zhe Khooi","orcid":"https://orcid.org/0009-0005-8048-8556"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Xin Zhe Khooi","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0005-8048-8556","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101871699","display_name":"Zhuo Jiang","orcid":"https://orcid.org/0000-0002-6144-7899"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuo Jiang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-6144-7899","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134558976","display_name":"Pan Xie","orcid":"https://orcid.org/0009-0008-7593-8956"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan Xie","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-7593-8956","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134555593","display_name":"Zhigang Cui","orcid":"https://orcid.org/0009-0004-7726-188X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhigang Cui","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-7726-188X","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046194584","display_name":"Meng Wang","orcid":"https://orcid.org/0000-0002-2829-1976"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meng Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-2829-1976","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134499209","display_name":"Yuze Jin","orcid":"https://orcid.org/0009-0001-2077-3786"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yuze Jin","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0001-2077-3786","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134509317","display_name":"Pengfei Huo","orcid":"https://orcid.org/0009-0000-9713-8134"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pengfei Huo","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-9713-8134","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128760700","display_name":"Dongyang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dongyang Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-6446-700X","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100420829","display_name":"Lulu Chen","orcid":"https://orcid.org/0000-0002-5108-8116"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lulu Chen","raw_affiliation_strings":["ByteDance, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-5108-8116","affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134511265","display_name":"Lei Wang","orcid":"https://orcid.org/0009-0002-3809-1879"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lei Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-3809-1879","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134520547","display_name":"Liaoyuan Feng","orcid":"https://orcid.org/0009-0005-4599-0532"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liaoyuan Feng","raw_affiliation_strings":["ByteDance, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-4599-0532","affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134541034","display_name":"Xiaodong Liu","orcid":"https://orcid.org/0009-0000-0296-5664"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaodong Liu","raw_affiliation_strings":["ByteDance, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0000-0296-5664","affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134519188","display_name":"Peng Li","orcid":"https://orcid.org/0009-0002-3355-2269"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng Li","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-3355-2269","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068611075","display_name":"Qinlong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qinlong Wang","raw_affiliation_strings":["ByteDance, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0007-5406-6354","affiliations":[{"raw_affiliation_string":"ByteDance, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101098174","display_name":"Yang Bai","orcid":"https://orcid.org/0009-0009-6416-0074"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Bai","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-6416-0074","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134502009","display_name":"Yongcan Wang","orcid":"https://orcid.org/0009-0001-7215-9753"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yongcan Wang","raw_affiliation_strings":["ByteDance, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-7215-9753","affiliations":[{"raw_affiliation_string":"ByteDance, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134524219","display_name":"Hao Jin","orcid":"https://orcid.org/0009-0004-0540-1089"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hao Jin","raw_affiliation_strings":["ByteDance, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-0540-1089","affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134496365","display_name":"Jinshuai Sun","orcid":"https://orcid.org/0009-0000-0600-7009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinshuai Sun","raw_affiliation_strings":["ByteDance, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-0600-7009","affiliations":[{"raw_affiliation_string":"ByteDance, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134548729","display_name":"Shan Lu","orcid":"https://orcid.org/0009-0002-8200-3717"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan Lu","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-8200-3717","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007309941","display_name":"Xiang Shi","orcid":"https://orcid.org/0000-0001-6179-4332"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiang Shi","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-6179-4332","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134524606","display_name":"Yingkai Zhao","orcid":"https://orcid.org/0009-0008-4775-0438"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yingkai Zhao","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-4775-0438","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117634641","display_name":"Haiquan Chen","orcid":"https://orcid.org/0000-0003-1689-0549"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haiquan Chen","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-8336-8065","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134532355","display_name":"Yi Li","orcid":"https://orcid.org/0009-0004-3069-9948"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi Li","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-3069-9948","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049104908","display_name":"Jianxi Ye","orcid":null},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jianxi Ye","raw_affiliation_strings":["ByteDance, Seattle, USA"],"raw_orcid":"https://orcid.org/0009-0007-3395-3624","affiliations":[{"raw_affiliation_string":"ByteDance, Seattle, USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101428727","display_name":"Mun Choon Chan","orcid":"https://orcid.org/0000-0002-6563-275X"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Mun Choon Chan","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-6563-275X","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":25,"corresponding_author_ids":["https://openalex.org/A5013210527"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.95401986,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"263","last_page":"278"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.3206999897956848,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.3206999897956848,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.05429999902844429,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.03970000147819519,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/failover","display_name":"Failover","score":0.8237000107765198},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.3212999999523163},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.30550000071525574},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.27239999175071716},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.2624000012874603}],"concepts":[{"id":"https://openalex.org/C109751979","wikidata":"https://www.wikidata.org/wiki/Q998767","display_name":"Failover","level":2,"score":0.8237000107765198},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5906999707221985},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.3212999999523163},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.32030001282691956},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.30550000071525574},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C107094494","wikidata":"https://www.wikidata.org/wiki/Q428453","display_name":"Fault tree analysis","level":2,"score":0.23810000717639923},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.23280000686645508},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2321999967098236}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3767295.3769322","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3767295.3769322","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st European Conference on Computer Systems","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3767295.3769322","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3767295.3769322","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st European Conference on Computer Systems","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2126969025","https://openalex.org/W2625782120","https://openalex.org/W3081168214","https://openalex.org/W3086105743","https://openalex.org/W3160031425","https://openalex.org/W3175449831","https://openalex.org/W4386072002","https://openalex.org/W4398796293","https://openalex.org/W4401176799"],"related_works":[],"abstract_inverted_index":{"Distributed":[0],"AI":[1,134],"training":[2,28,62,135],"often":[3],"suffers":[4],"from":[5],"network":[6,54,76],"faults.":[7],"Network":[8],"faults,":[9],"especially":[10],"at":[11],"the":[12,52],"last":[13],"hop":[14],"between":[15],"a":[16,19,39,44,75],"switch":[17],"and":[18,31,92],"host,":[20],"result":[21],"in":[22,27,128],"loss":[23],"of":[24],"connectivity,":[25],"resulting":[26],"job":[29],"stalls":[30],"eventual":[32],"failure.":[33],"This":[34],"is":[35],"typically":[36],"managed":[37],"through":[38],"fail-stop":[40],"mechanism,":[41],"followed":[42],"by":[43,67],"restart,":[45],"incurring":[46],"significant":[47,126],"inefficiencies.":[48],"We":[49],"present":[50],"ReCCL,":[51],"first":[53],"fault-tolerant":[55],"collective":[56],"communication":[57,83,98],"library":[58],"(CCL)":[59],"that":[60,103,118],"allows":[61],"progress":[63],"to":[64,71,96,124],"be":[65,121],"preserved":[66],"seamlessly":[68,108],"failing":[69],"over":[70],"alternate":[72],"paths":[73],"when":[74],"fault":[77],"occurs.":[78],"During":[79],"failover,":[80],"ReCCL":[81,104],"keeps":[82],"states":[84],"synchronized":[85],"while":[86],"using":[87],"dynamic":[88],"channel":[89],"load":[90],"balancing":[91],"intra-host":[93],"GPU":[94,129],"routing":[95],"improve":[97],"performance.":[99],"Our":[100],"evaluations":[101],"demonstrate":[102,117],"can":[105,120],"perform":[106],"failover":[107,119],"with":[109],"minimal":[110],"performance":[111],"losses.":[112],"Additionally,":[113],"our":[114],"simulations":[115],"also":[116],"effectively":[122],"used":[123],"achieve":[125],"savings":[127],"hours":[130],"for":[131],"large-scale":[132],"distributed":[133],"workloads.":[136]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2026-04-25T00:00:00"}
