{"id":"https://openalex.org/W4416203828","doi":"https://doi.org/10.1145/3712285.3759893","title":"Exploring and Mitigating Failure Behavior of Large Language Model Training Workloads in HPC Systems","display_name":"Exploring and Mitigating Failure Behavior of Large Language Model Training Workloads in HPC Systems","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W4416203828","doi":"https://doi.org/10.1145/3712285.3759893"},"language":null,"primary_location":{"id":"doi:10.1145/3712285.3759893","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3712285.3759893","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102017383","display_name":"Pengfei Yu","orcid":"https://orcid.org/0000-0002-9926-9442"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Pengfei Yu","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100370034","display_name":"Jingjing Gu","orcid":"https://orcid.org/0000-0002-3989-1520"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingjing Gu","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101682487","display_name":"Hao Han","orcid":"https://orcid.org/0000-0002-7912-027X"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Han","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075007087","display_name":"Dazhong Shen","orcid":"https://orcid.org/0000-0002-3947-4153"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dazhong Shen","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117243255","display_name":"Bao Wen","orcid":"https://orcid.org/0009-0004-4750-6516"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bao Wen","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5104360912","display_name":"Yang Liu","orcid":"https://orcid.org/0009-0001-2596-8419"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Liu","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5102017383"],"corresponding_institution_ids":["https://openalex.org/I9842412"],"apc_list":null,"apc_paid":null,"fwci":0.6973,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77655342,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1165","last_page":"1179"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.3343000113964081,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.3343000113964081,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.1354999989271164,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.1225999966263771,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6474000215530396},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5633999705314636},{"id":"https://openalex.org/keywords/resilience","display_name":"Resilience (materials science)","score":0.5464000105857849},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.5385000109672546},{"id":"https://openalex.org/keywords/fault-injection","display_name":"Fault injection","score":0.5051000118255615},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4952000081539154},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4674000144004822},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46540001034736633},{"id":"https://openalex.org/keywords/fault-model","display_name":"Fault model","score":0.462799996137619}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7407000064849854},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6474000215530396},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5633999705314636},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.5464000105857849},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.5385000109672546},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.5097000002861023},{"id":"https://openalex.org/C2775928411","wikidata":"https://www.wikidata.org/wiki/Q2041312","display_name":"Fault injection","level":3,"score":0.5051000118255615},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4952000081539154},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4674000144004822},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46540001034736633},{"id":"https://openalex.org/C167391956","wikidata":"https://www.wikidata.org/wiki/Q1401211","display_name":"Fault model","level":3,"score":0.462799996137619},{"id":"https://openalex.org/C175551986","wikidata":"https://www.wikidata.org/wiki/Q47089","display_name":"Fault (geology)","level":2,"score":0.40790000557899475},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.3953000009059906},{"id":"https://openalex.org/C152745839","wikidata":"https://www.wikidata.org/wiki/Q5438153","display_name":"Fault detection and isolation","level":3,"score":0.37779998779296875},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3686000108718872},{"id":"https://openalex.org/C50712370","wikidata":"https://www.wikidata.org/wiki/Q4269346","display_name":"Software fault tolerance","level":3,"score":0.36059999465942383},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3564999997615814},{"id":"https://openalex.org/C2780799671","wikidata":"https://www.wikidata.org/wiki/Q17087362","display_name":"Transient (computer programming)","level":2,"score":0.35429999232292175},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.34540000557899475},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3334999978542328},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33180001378059387},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C112987892","wikidata":"https://www.wikidata.org/wiki/Q5051574","display_name":"Catastrophic failure","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.3151000142097473},{"id":"https://openalex.org/C126953365","wikidata":"https://www.wikidata.org/wiki/Q5438152","display_name":"Fault coverage","level":3,"score":0.29510000348091125},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2712000012397766},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.26899999380111694}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3712285.3759893","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3712285.3759893","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5418828534","display_name":null,"funder_award_id":"62406141","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W1619226191","https://openalex.org/W1966243865","https://openalex.org/W2100142058","https://openalex.org/W2120860555","https://openalex.org/W2168234447","https://openalex.org/W2318507312","https://openalex.org/W2522720996","https://openalex.org/W2767260595","https://openalex.org/W2773474514","https://openalex.org/W2798993323","https://openalex.org/W2799640043","https://openalex.org/W2809188712","https://openalex.org/W2923014074","https://openalex.org/W2963015836","https://openalex.org/W2990200213","https://openalex.org/W3000442532","https://openalex.org/W3022067362","https://openalex.org/W3046764219","https://openalex.org/W3111013705","https://openalex.org/W3129298663","https://openalex.org/W3135464648","https://openalex.org/W3135774742","https://openalex.org/W3178469298","https://openalex.org/W3187552919","https://openalex.org/W3188924112","https://openalex.org/W3204998121","https://openalex.org/W3211099997","https://openalex.org/W3215256528","https://openalex.org/W4280598352","https://openalex.org/W4285820867","https://openalex.org/W4309857285","https://openalex.org/W4380881139","https://openalex.org/W4388031367","https://openalex.org/W4388661953","https://openalex.org/W4388662069","https://openalex.org/W4395001560","https://openalex.org/W4395106422","https://openalex.org/W4396835373","https://openalex.org/W4400034224","https://openalex.org/W4400123707","https://openalex.org/W4400410938","https://openalex.org/W4403063482","https://openalex.org/W4403390857","https://openalex.org/W4405755284","https://openalex.org/W4406014313","https://openalex.org/W4406164090","https://openalex.org/W4407219227","https://openalex.org/W4408891420","https://openalex.org/W4409116517","https://openalex.org/W4409149346","https://openalex.org/W4415796026"],"related_works":[],"abstract_inverted_index":{"The":[0],"exponential":[1],"growth":[2],"of":[3,131],"Large":[4],"Language":[5],"Model":[6],"(LLM)":[7],"training":[8,38,85,147],"demands":[9],"in":[10,25,36,134],"HPC":[11],"systems":[12],"has":[13],"exposed":[14],"critical":[15,89],"reliability":[16],"challenges,":[17,47],"particularly":[18],"from":[19],"transient":[20],"faults.":[21],"Unlike":[22],"resilience":[23],"studies":[24],"conventional":[26],"DNN":[27],"inference,":[28],"the":[29,84],"massive":[30],"parameter":[31],"scale":[32],"and":[33,56,119],"iterative":[34],"updates":[35],"LLM":[37,146],"trigger":[39],"more":[40],"complex":[41],"failure":[42,60],"patterns.":[43],"To":[44],"address":[45],"these":[46],"we":[48,99],"introduce":[49],"LLMFI,":[50],"a":[51,88,102],"new":[52],"fault":[53,64,105,117,135],"injection":[54,65],"tool,":[55],"reveal":[57],"six":[58],"distinct":[59],"behaviors":[61],"through":[62],"300K+":[63],"experiments":[66],"(exceeding":[67],"5K":[68],"GPU":[69,141],"node-hours).":[70],"Our":[71],"key":[72],"insight":[73],"is":[74],"that,":[75],"while":[76],"most":[77],"injected":[78],"faults":[79],"are":[80],"eventually":[81],"masked":[82],"by":[83],"iteration":[86],"mechanism,":[87],"subset":[90],"leads":[91],"to":[92],"catastrophic":[93],"failures":[94],"or":[95],"performance":[96],"degradation.":[97],"Further,":[98],"propose":[100],"LLMFT,":[101],"novel":[103],"machine-learning-based":[104],"tolerance":[106],"framework":[107],"that":[108,126],"implements":[109],"closed-loop":[110],"error":[111],"control":[112],"via":[113],"heuristic":[114],"feature":[115],"extraction,":[116],"detector,":[118],"dual":[120],"recovery":[121],"mechanisms.":[122],"Extensive":[123],"evaluation":[124],"demonstrates":[125],"LLMFT":[127],"achieves":[128],"an":[129],"average":[130],"97.61%":[132],"F1-score":[133],"detection":[136],"with":[137],"only":[138],"0.01%\u20130.05%":[139],"additional":[140],"memory":[142],"overhead,":[143],"effectively":[144],"mitigating":[145],"failures.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-12T00:00:00"}
