{"id":"https://openalex.org/W4414736146","doi":"https://doi.org/10.1145/3731569.3764838","title":"Robust LLM Training Infrastructure at ByteDance","display_name":"Robust LLM Training Infrastructure at ByteDance","publication_year":2025,"publication_date":"2025-10-01","ids":{"openalex":"https://openalex.org/W4414736146","doi":"https://doi.org/10.1145/3731569.3764838"},"language":"en","primary_location":{"id":"doi:10.1145/3731569.3764838","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731569.3764838","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3731569.3764838","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093967737","display_name":"Borui Wan","orcid":"https://orcid.org/0009-0008-5902-1611"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Borui Wan","raw_affiliation_strings":["School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong"],"raw_orcid":"https://orcid.org/0009-0008-5902-1611","affiliations":[{"raw_affiliation_string":"School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010280075","display_name":"G.L. Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gaohong Liu","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-2551-7879","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zuquan Song","orcid":"https://orcid.org/0009-0008-7576-6162"},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zuquan Song","raw_affiliation_strings":["ByteDance, Seattle, USA"],"raw_orcid":"https://orcid.org/0009-0008-7576-6162","affiliations":[{"raw_affiliation_string":"ByteDance, Seattle, USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jun Wang","orcid":"https://orcid.org/0009-0000-8493-0624"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jun Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-8493-0624","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yun Zhang","orcid":"https://orcid.org/0009-0009-9159-3107"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun Zhang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-9159-3107","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020661868","display_name":"Guangming Sheng","orcid":"https://orcid.org/0000-0003-3395-3994"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Guangming Sheng","raw_affiliation_strings":["School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong"],"raw_orcid":"https://orcid.org/0000-0003-3395-3994","affiliations":[{"raw_affiliation_string":"School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034563547","display_name":"Shuguang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuguang Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-4249-4092","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Houmin Wei","orcid":"https://orcid.org/0009-0008-0066-8154"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Houmin Wei","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-0066-8154","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Chenyuan Wang","orcid":"https://orcid.org/0009-0007-5553-2601"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chenyuan Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0007-5553-2601","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112332166","display_name":"W. Z. Lou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weiqiang Lou","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-7390-0581","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112972353","display_name":"Xi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xi Yang","raw_affiliation_strings":["ByteDance, San Jose, USA"],"raw_orcid":"https://orcid.org/0009-0006-2195-8834","affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Mofan Zhang","orcid":"https://orcid.org/0009-0005-1177-9771"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mofan Zhang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-1177-9771","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Kaihua Jiang","orcid":"https://orcid.org/0009-0006-4598-3807"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaihua Jiang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-4598-3807","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Cheng Ren","orcid":"https://orcid.org/0009-0004-9946-8320"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng Ren","raw_affiliation_strings":["ByteDance, San Jose, USA"],"raw_orcid":"https://orcid.org/0009-0004-9946-8320","affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaoyun Zhi","orcid":"https://orcid.org/0009-0002-5403-9720"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaoyun Zhi","raw_affiliation_strings":["ByteDance, San Jose, USA"],"raw_orcid":"https://orcid.org/0009-0002-5403-9720","affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074205338","display_name":"Menghan Yu","orcid":"https://orcid.org/0009-0008-8240-8144"},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Menghan Yu","raw_affiliation_strings":["ByteDance, Seattle, USA"],"raw_orcid":"https://orcid.org/0009-0008-8240-8144","affiliations":[{"raw_affiliation_string":"ByteDance, Seattle, USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhe Nan","orcid":"https://orcid.org/0009-0001-6311-0018"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhe Nan","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-6311-0018","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhuolin Zheng","orcid":"https://orcid.org/0009-0007-2228-1527"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuolin Zheng","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0007-2228-1527","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037661446","display_name":"Baoquan Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baoquan Zhong","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-0833-5619","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Qinlong Wang","orcid":"https://orcid.org/0009-0007-5406-6354"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qinlong Wang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0007-5406-6354","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Huan Yu","orcid":"https://orcid.org/0009-0000-8102-5055"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huan Yu","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-8102-5055","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056091188","display_name":"Jinxin Chi","orcid":"https://orcid.org/0000-0002-9803-6997"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinxin Chi","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-3708-3065","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wang Zhang","orcid":"https://orcid.org/0009-0001-2436-8761"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang Zhang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-2436-8761","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuhan Li","orcid":"https://orcid.org/0009-0001-5310-9523"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuhan Li","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-5310-9523","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101361287","display_name":"Zi-Xian Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zixian Du","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-9750-7157","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Sida Zhao","orcid":"https://orcid.org/0009-0006-3100-2772"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sida Zhao","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-3100-2772","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065495256","display_name":"Yongqiang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yongqiang Zhang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-0351-5272","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jingzhe Tang","orcid":"https://orcid.org/0009-0008-8094-8112"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jingzhe Tang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-8094-8112","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090569171","display_name":"Zherui Liu","orcid":"https://orcid.org/0009-0002-4910-6095"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zherui Liu","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-4910-6095","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012597518","display_name":"Chuan Wu","orcid":"https://orcid.org/0000-0002-3144-4398"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chuan Wu","raw_affiliation_strings":["School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong"],"raw_orcid":"https://orcid.org/0000-0002-3144-4398","affiliations":[{"raw_affiliation_string":"School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013856812","display_name":"Yanghua Peng","orcid":"https://orcid.org/0000-0003-3989-4358"},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yanghua Peng","raw_affiliation_strings":["ByteDance, Seattle, USA"],"raw_orcid":"https://orcid.org/0000-0003-3989-4358","affiliations":[{"raw_affiliation_string":"ByteDance, Seattle, USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084047386","display_name":"Haibin Lin","orcid":"https://orcid.org/0000-0003-4879-5335"},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haibin Lin","raw_affiliation_strings":["ByteDance, Seattle, USA"],"raw_orcid":"https://orcid.org/0000-0003-4879-5335","affiliations":[{"raw_affiliation_string":"ByteDance, Seattle, USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086945155","display_name":"Wencong Xiao","orcid":"https://orcid.org/0000-0002-3043-522X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wencong Xiao","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-3043-522X","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113343221","display_name":"Xin Liu","orcid":"https://orcid.org/0009-0000-8346-3323"},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xin Liu","raw_affiliation_strings":["ByteDance, Seattle, USA"],"raw_orcid":"https://orcid.org/0009-0000-8346-3323","affiliations":[{"raw_affiliation_string":"ByteDance, Seattle, USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050006104","display_name":"Xiang Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang Xiang","raw_affiliation_strings":["ByteDance, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-2159-4971","affiliations":[{"raw_affiliation_string":"ByteDance, Beijing, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":35,"corresponding_author_ids":["https://openalex.org/A5093967737"],"corresponding_institution_ids":["https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":1.8184,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88483749,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"186","last_page":"203"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9567000269889832,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9567000269889832,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.9322999715805054,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.7425000071525574},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6851999759674072},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6539999842643738},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.46050000190734863},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.42669999599456787},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4092999994754791}],"concepts":[{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.7425000071525574},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7146999835968018},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6851999759674072},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6539999842643738},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.46050000190734863},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.42669999599456787},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4092999994754791},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.28130000829696655},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2802000045776367},{"id":"https://openalex.org/C175551986","wikidata":"https://www.wikidata.org/wiki/Q47089","display_name":"Fault (geology)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2775000035762787},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27730000019073486},{"id":"https://openalex.org/C2778583658","wikidata":"https://www.wikidata.org/wiki/Q849415","display_name":"On-the-job training","level":3,"score":0.27649998664855957},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.2572999894618988}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3731569.3764838","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731569.3764838","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3731569.3764838","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731569.3764838","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2623323969","https://openalex.org/W2734941459","https://openalex.org/W2798515322","https://openalex.org/W2895690683","https://openalex.org/W2969388332","https://openalex.org/W3005780259","https://openalex.org/W3171842021","https://openalex.org/W3204998121","https://openalex.org/W4220741164","https://openalex.org/W4221158240","https://openalex.org/W4224308101","https://openalex.org/W4280611847","https://openalex.org/W4380881139","https://openalex.org/W4386768656","https://openalex.org/W4386840193","https://openalex.org/W4387302750","https://openalex.org/W4387321084","https://openalex.org/W4388662057","https://openalex.org/W4389158499","https://openalex.org/W4390529129","https://openalex.org/W4394745249","https://openalex.org/W4394923484","https://openalex.org/W4395117348","https://openalex.org/W4399511799","https://openalex.org/W4399694321","https://openalex.org/W4400770595","https://openalex.org/W4404400583","https://openalex.org/W4404400739"],"related_works":[],"abstract_inverted_index":{"The":[0],"training":[1,50,55,61,91,100,142,166,175],"scale":[2,32],"of":[3,11,13,23,29,36,92,98,119,143,162],"large":[4],"language":[5],"models":[6],"(LLMs)":[7],"has":[8],"reached":[9],"tens":[10],"thousands":[12],"GPUs":[14,157],"and":[15,66,89,102,108,117,130,140,158],"is":[16,33,147],"still":[17],"continuously":[18],"expanding,":[19],"enabling":[20],"faster":[21],"learning":[22],"larger":[24],"models.":[25],"Accompanying":[26],"the":[27,30,34,96,160,163],"expansion":[28],"resource":[31],"prevalence":[35],"failures":[37,110],"(CUDA":[38],"error,":[39],"NaN":[40],"values,":[41],"job":[42,176],"hang,":[43],"etc.),":[44],"which":[45],"poses":[46],"significant":[47],"challenges":[48],"to":[49,70,106],"stability.":[51],"Any":[52],"large-scale":[53,81],"LLM":[54,99,120,144],"infrastructure":[56,83],"should":[57],"strive":[58],"for":[59,87,172],"minimal":[60],"interruption,":[62],"efficient":[63,73,141],"fault":[64,125,128],"diagnosis,":[65],"effective":[67,134],"failure":[68],"tolerance":[69],"enable":[71],"highly":[72],"continuous":[74,139],"training.":[75],"This":[76],"paper":[77],"presents":[78],"ByteRobust,":[79],"a":[80,112,150,173],"GPU":[82,152],"management":[84],"system":[85],"tailored":[86],"robust":[88],"stable":[90],"LLMs.":[93],"It":[94],"exploits":[95],"uniqueness":[97],"process":[101],"gives":[103],"top":[104],"priorities":[105],"detecting":[107],"recovering":[109],"in":[111,165],"routine":[113],"manner.":[114],"Leveraging":[115],"parallelisms":[116],"characteristics":[118],"training,":[121],"ByteRobust":[122,146],"enables":[123],"high-capacity":[124],"tolerance,":[126],"prompt":[127],"demarcation,":[129],"localization":[131],"with":[132,154],"an":[133],"data-driven":[135],"approach,":[136],"comprehensively":[137],"ensuring":[138],"tasks.":[145],"deployed":[148],"on":[149,177],"production":[151],"platform":[153],"over":[155],"200,000":[156],"advances":[159],"state":[161],"art":[164],"robustness":[167],"by":[168],"achieving":[169],"97%":[170],"ETTR":[171],"three-month":[174],"9,600":[178],"GPUs.":[179]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
