{"id":"https://openalex.org/W7160914352","doi":"https://doi.org/10.48550/arxiv.2605.09370","title":"From Detection to Recovery: Operational Analysis on LLM Pre-training with 504 GPUs","display_name":"From Detection to Recovery: Operational Analysis on LLM Pre-training with 504 GPUs","publication_year":2026,"publication_date":"2026-05-10","ids":{"openalex":"https://openalex.org/W7160914352","doi":"https://doi.org/10.48550/arxiv.2605.09370"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.09370","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09370","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.09370","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025441207","display_name":"Daemyung Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Daemyung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135926292","display_name":"Eunjin Hwang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hwang, Eunjin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135924189","display_name":"Hanjeong Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Hanjeong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135997565","display_name":"HyeokJin Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, HyeokJin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135979851","display_name":"Hyunhoi Koo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koo, Hyunhoi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087110191","display_name":"Jeongkyu Shin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shin, Jeongkyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135994602","display_name":"Jeongseok Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Jeongseok","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135951803","display_name":"Jihyun Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Jihyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022640515","display_name":"Joongi Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heo, Jinho","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135962864","display_name":"Junbum Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Joongi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135917993","display_name":"Jungseung Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Junbum","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001880649","display_name":"kyujin Cho","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jungseung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135969748","display_name":"Youngsook Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cho, Kyujin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Song, Youngsook","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Youngsook","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.8003000020980835,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.8003000020980835,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.034699998795986176,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.01549999974668026,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.7350000143051147},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.6814000010490417},{"id":"https://openalex.org/keywords/false-positive-paradox","display_name":"False positive paradox","score":0.5863000154495239},{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.4350000023841858},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.38179999589920044},{"id":"https://openalex.org/keywords/distributed-database","display_name":"Distributed database","score":0.34200000762939453},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.3416000008583069},{"id":"https://openalex.org/keywords/failure-rate","display_name":"Failure rate","score":0.33820000290870667}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7612000107765198},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.7350000143051147},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.6814000010490417},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.5863000154495239},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.4821000099182129},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.4350000023841858},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.38179999589920044},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.38119998574256897},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.35359999537467957},{"id":"https://openalex.org/C70061542","wikidata":"https://www.wikidata.org/wiki/Q989016","display_name":"Distributed database","level":2,"score":0.34200000762939453},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.3416000008583069},{"id":"https://openalex.org/C163164238","wikidata":"https://www.wikidata.org/wiki/Q2737027","display_name":"Failure rate","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.30790001153945923},{"id":"https://openalex.org/C115874739","wikidata":"https://www.wikidata.org/wiki/Q825377","display_name":"Critical path method","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C2778067643","wikidata":"https://www.wikidata.org/wiki/Q166507","display_name":"Interval (graph theory)","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2728999853134155},{"id":"https://openalex.org/C2780940931","wikidata":"https://www.wikidata.org/wiki/Q174989","display_name":"File system","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C108713360","wikidata":"https://www.wikidata.org/wiki/Q1824206","display_name":"Supply chain","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2621000111103058},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.26080000400543213}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.09370","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09370","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.09370","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09370","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6518976092338562,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large-scale":[0],"AI":[1],"training":[2,26,61],"is":[3,65,125],"fundamentally":[4],"a":[5,37,77,86,95,203,218],"distributed":[6],"systems":[7],"problem,":[8],"where":[9],"hardware":[10],"failures":[11],"are":[12,229],"routine":[13],"operating":[14],"conditions":[15],"rather":[16],"than":[17],"rare":[18],"exceptions,":[19],"yet":[20],"public":[21],"operational":[22,56],"evidence":[23],"from":[24,142],"production":[25,41,232],"clusters":[27],"remains":[28],"limited.":[29],"This":[30,81],"report":[31],"presents":[32],"an":[33],"empirical":[34],"analysis":[35,201],"of":[36,48,55,85,153,163,191,222],"63-node":[38],"NVIDIA":[39,72],"B200":[40],"cluster":[42],"(504":[43],"GPUs),":[44],"using":[45],"55":[46],"days":[47,54],"Prometheus":[49,115],"time-series":[50],"data":[51],"and":[52,117,159,172,240],"73":[53,182],"logs":[57],"covering":[58],"224":[59,179],"multi-node":[60],"sessions.":[62],"The":[63],"environment":[64],"cross-organizational:":[66],"five":[67],"parties":[68],"(SKT,":[69],"Upstage,":[70],"Lablup,":[71],"Korea,":[73],"VAST":[74],"Data)":[75],"share":[76],"unified":[78,241],"monitoring":[79],"pipeline.":[80],"enabled":[82],"joint":[83],"diagnosis":[84],"60-node-scale":[87],"storage":[88],"I/O":[89],"bottleneck":[90],"absent":[91],"in":[92,231],"2-4-node":[93],"tests,":[94],"production-scale":[96],"phenomenon":[97],"no":[98,122],"single":[99,123],"team":[100],"could":[101],"isolate":[102],"alone.":[103],"We":[104],"perform":[105],"three":[106],"quantitative":[107],"analyses":[108,228],"yielding":[109],"four":[110],"findings.":[111],"First,":[112],"over":[113,181,196,207],"751":[114],"metrics":[116],"10":[118],"XID-identified":[119],"GPU":[120,143],"failures,":[121],"metric":[124],"consistently":[126],"dominant":[127],"across":[128,178],"failure":[129],"types,":[130],"motivating":[131],"multi-signal":[132],"detection.":[133],"Second,":[134],"523":[135],"checkpoint":[136],"events":[137],"trace":[138],"the":[139,146,188,213],"save/load":[140],"path":[141],"VRAM":[144],"to":[145],"NFS":[147],"server:":[148],"restart":[149],"loading":[150],"reaches":[151],"21.5%":[152],"maximum":[154,164],"read":[155],"bandwidth":[156,166],"(700":[157],"GB/s)":[158],"save":[160],"bursts":[161],"16.0%":[162],"write":[165],"(250":[167],"GB/s),":[168],"with":[169,217],"NFS/RPC":[170],"queueing":[171],"transport-layer":[173],"backlog":[174],"rising":[175],"together.":[176],"Third,":[177],"sessions":[180],"days,":[183],"node":[184],"exclusions":[185],"concentrate":[186],"so":[187],"top":[189],"3":[190],"63":[192],"nodes":[193],"account":[194],"for":[195],"50%.":[197],"Fourth,":[198],"auto-retry":[199],"chain":[200],"shows":[202],"33.3%":[204],"success":[205],"rate":[206],"12":[208],"chains":[209],"(73":[210],"attempts),":[211],"2.7x":[212],"12.5%":[214],"manual":[215],"rate,":[216],"median":[219],"retry":[220],"interval":[221],"11":[223],"minutes":[224],"(IQR":[225],"10-11).":[226],"All":[227],"grounded":[230],"infrastructure":[233],"providing":[234],"session-level":[235],"workload":[236],"management,":[237],"GPU-centric":[238],"scheduling,":[239],"observability.":[242]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
