{"id":"https://openalex.org/W4409248832","doi":"https://doi.org/10.1109/hpca61900.2025.00096","title":"Revisiting Reliability in Large-Scale Machine Learning Research Clusters","display_name":"Revisiting Reliability in Large-Scale Machine Learning Research Clusters","publication_year":2025,"publication_date":"2025-03-01","ids":{"openalex":"https://openalex.org/W4409248832","doi":"https://doi.org/10.1109/hpca61900.2025.00096"},"language":"en","primary_location":{"id":"doi:10.1109/hpca61900.2025.00096","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00096","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114633502","display_name":"Apostolos Kokolis","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Apostolos Kokolis","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002457261","display_name":"Michael Kuchnik","orcid":"https://orcid.org/0000-0002-0805-1828"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Michael Kuchnik","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109346484","display_name":"John P. Hoffman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"John Hoffman","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080055003","display_name":"Adithya Kumar","orcid":"https://orcid.org/0000-0002-5614-8878"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Adithya Kumar","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026896981","display_name":"Parth Malani","orcid":"https://orcid.org/0009-0001-0589-5048"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Parth Malani","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111739530","display_name":"Famai Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Faye Ma","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008830900","display_name":"Zachary DeVito","orcid":"https://orcid.org/0009-0002-8863-1503"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zachary DeVito","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003509810","display_name":"Shubho Sengupta","orcid":"https://orcid.org/0009-0005-0818-0979"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shubho Sengupta","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092506186","display_name":"Kalyan Saladi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kalyan Saladi","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028220093","display_name":"Carole-Jean Wu","orcid":"https://orcid.org/0000-0002-9032-7239"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Carole-Jean Wu","raw_affiliation_strings":["FAIR at Meta"],"affiliations":[{"raw_affiliation_string":"FAIR at Meta","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5114633502"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":26.0791,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.99431858,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1259","last_page":"1274"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.6906999945640564,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.6906999945640564,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11891","display_name":"Big Data and Business Intelligence","score":0.6593999862670898,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11396","display_name":"Artificial Intelligence in Healthcare","score":0.5788999795913696,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.7016573548316956},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6328473687171936},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.617591917514801},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4598390460014343},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4375758767127991},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.13144972920417786},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.07550027966499329}],"concepts":[{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.7016573548316956},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6328473687171936},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.617591917514801},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4598390460014343},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4375758767127991},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.13144972920417786},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.07550027966499329},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca61900.2025.00096","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00096","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.4099999964237213,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":46,"referenced_works":["https://openalex.org/W1596936080","https://openalex.org/W1999900893","https://openalex.org/W2033656974","https://openalex.org/W2045271686","https://openalex.org/W2129542763","https://openalex.org/W2133046454","https://openalex.org/W2141992894","https://openalex.org/W2606722458","https://openalex.org/W2902410312","https://openalex.org/W2903241786","https://openalex.org/W4280651118","https://openalex.org/W4307424368","https://openalex.org/W4380874786","https://openalex.org/W4380881139","https://openalex.org/W4386117806","https://openalex.org/W4387302750","https://openalex.org/W4390874575","https://openalex.org/W4394998532","https://openalex.org/W4395106422","https://openalex.org/W4400786105","https://openalex.org/W4401176799","https://openalex.org/W4401211883","https://openalex.org/W4402389838","https://openalex.org/W6628377381","https://openalex.org/W6660700592","https://openalex.org/W6680402377","https://openalex.org/W6684859321","https://openalex.org/W6758283263","https://openalex.org/W6767997687","https://openalex.org/W6784348005","https://openalex.org/W6784593087","https://openalex.org/W6790814326","https://openalex.org/W6810081322","https://openalex.org/W6810475161","https://openalex.org/W6838632916","https://openalex.org/W6839734537","https://openalex.org/W6850625674","https://openalex.org/W6856858123","https://openalex.org/W6858846850","https://openalex.org/W6859583170","https://openalex.org/W6860016859","https://openalex.org/W6860041859","https://openalex.org/W6861885378","https://openalex.org/W6862033441","https://openalex.org/W6871040028","https://openalex.org/W6874271334"],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W4387369504","https://openalex.org/W4394896187","https://openalex.org/W3170094116","https://openalex.org/W4386462264","https://openalex.org/W3107602296","https://openalex.org/W4364306694","https://openalex.org/W4312192474","https://openalex.org/W4283697347"],"abstract_inverted_index":{"Reliability":[0],"is":[1],"a":[2,45,126,161,177,181,189],"fundamental":[3],"challenge":[4],"in":[5,63,91],"operating":[6],"large-scale":[7],"machine":[8],"learning":[9],"(ML)":[10],"infrastructures,":[11],"particularly":[12],"as":[13,188],"the":[14,32,87,92,117,201,220,227],"scale":[15],"of":[16,27,34,47,89,119,128,137,191,203,222],"ML":[17,52,120,142],"models":[18],"and":[19,59,65,94,110,130,148,194,214,232,237],"training":[20,121],"clusters":[21,93],"continues":[22],"to":[23,81,164,168,179,199],"grow.":[24],"Despite":[25],"decades":[26],"research":[28,216],"on":[29,156],"infrastructure":[30],"failures,":[31,82],"impact":[33],"job":[35,192],"failures":[36,129],"across":[37,108],"different":[38],"scales":[39],"remains":[40],"unclear.":[41],"This":[42],"paper":[43],"presents":[44],"view":[46],"managing":[48],"two":[49,140],"large,":[50],"multi-tenant":[51],"clusters,":[53,109,225],"providing":[54],"quantitative":[55],"analysis,":[56],"operational":[57],"experience,":[58],"our":[60,157],"own":[61],"perspective":[62],"understanding":[64],"addressing":[66],"reliability":[67,113,132,221],"concerns":[68],"at":[69,122,207],"scale.":[70,208],"Our":[71,209],"analysis":[72],"reveals":[73],"that":[74],"while":[75],"large":[76],"jobs":[77,84,90,147],"are":[78],"most":[79],"vulnerable":[80],"smaller":[83],"make":[85],"up":[86],"majority":[88],"should":[95],"be":[96],"incorporated":[97],"into":[98],"optimization":[99],"objectives.":[100],"We":[101,174],"identify":[102],"key":[103,131],"workload":[104],"properties,":[105],"compare":[106],"them":[107],"demonstrate":[111],"essential":[112],"requirements":[114],"for":[115,170,218,229],"pushing":[116],"boundaries":[118],"scale.We":[123],"hereby":[124],"introduce":[125],"taxonomy":[127],"metrics,":[133],"analyze":[134],"11":[135],"months":[136],"data":[138],"from":[139],"state-of-the-art":[141],"environments":[143],"with":[144],"4":[145],"million":[146,151],"over":[149],"150":[150],"A100":[152],"GPU":[153,172],"hours.":[154],"Building":[155],"data,":[158],"we":[159,195],"fit":[160],"failure":[162],"model":[163,198],"project":[165],"Mean":[166],"Time":[167,186],"Failure":[169],"various":[171],"scales.":[173],"further":[175],"propose":[176],"method":[178],"estimate":[180],"related":[182],"metric,":[183],"Effective":[184],"Training":[185],"Ratio,":[187],"function":[190],"parameters,":[193],"use":[196],"this":[197],"gauge":[200],"efficacy":[202],"potential":[204],"software":[205],"mitigations":[206],"work":[210],"provides":[211],"valuable":[212],"insights":[213],"future":[215],"directions":[217],"improving":[219],"AI":[223],"supercomputer":[224],"emphasizing":[226],"need":[228],"flexible,":[230],"workload-agnostic,":[231],"reliability-aware":[233],"infrastructure,":[234],"system":[235],"software,":[236],"algorithms.":[238]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":9}],"updated_date":"2026-04-24T08:23:43.765630","created_date":"2025-10-10T00:00:00"}
