{"id":"https://openalex.org/W4380881139","doi":"https://doi.org/10.1145/3579371.3589105","title":"Understanding and Mitigating Hardware Failures in Deep Learning Training Systems","display_name":"Understanding and Mitigating Hardware Failures in Deep Learning Training Systems","publication_year":2023,"publication_date":"2023-06-16","ids":{"openalex":"https://openalex.org/W4380881139","doi":"https://doi.org/10.1145/3579371.3589105"},"language":"en","primary_location":{"id":"doi:10.1145/3579371.3589105","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3579371.3589105","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3579371.3589105","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 50th Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3579371.3589105","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100702362","display_name":"Yi He","orcid":"https://orcid.org/0000-0001-7206-4845"},"institutions":[{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yi He","raw_affiliation_strings":["University of Chicago, Chicago, USA"],"raw_orcid":"https://orcid.org/0000-0001-7206-4845","affiliations":[{"raw_affiliation_string":"University of Chicago, Chicago, USA","institution_ids":["https://openalex.org/I40347166"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038450823","display_name":"Mike Hutton","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mike Hutton","raw_affiliation_strings":["Google, Sunnyvale, USA"],"raw_orcid":"https://orcid.org/0009-0009-0139-7242","affiliations":[{"raw_affiliation_string":"Google, Sunnyvale, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103220002","display_name":"Steven H. Chan","orcid":"https://orcid.org/0009-0003-5722-3467"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Steven Chan","raw_affiliation_strings":["Google, Sunnyvale, USA"],"raw_orcid":"https://orcid.org/0009-0003-5722-3467","affiliations":[{"raw_affiliation_string":"Google, Sunnyvale, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092183284","display_name":"Robert De Gruijl","orcid":"https://orcid.org/0009-0004-7911-6213"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Robert De Gruijl","raw_affiliation_strings":["Google, Sunnyvale, USA"],"raw_orcid":"https://orcid.org/0009-0004-7911-6213","affiliations":[{"raw_affiliation_string":"Google, Sunnyvale, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001964650","display_name":"Rama Govindaraju","orcid":"https://orcid.org/0009-0008-3783-7150"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rama Govindaraju","raw_affiliation_strings":["Google, Sunnyvale, USA"],"raw_orcid":"https://orcid.org/0009-0008-3783-7150","affiliations":[{"raw_affiliation_string":"Google, Sunnyvale, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037285672","display_name":"Nishant Patil","orcid":"https://orcid.org/0000-0001-6620-0038"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nishant Patil","raw_affiliation_strings":["Google, Sunnyvale, USA"],"raw_orcid":"https://orcid.org/0000-0001-6620-0038","affiliations":[{"raw_affiliation_string":"Google, Sunnyvale, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101989053","display_name":"Yanjing Li","orcid":"https://orcid.org/0000-0003-0124-0463"},"institutions":[{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yanjing Li","raw_affiliation_strings":["University of Chicago, Chicago, USA"],"raw_orcid":"https://orcid.org/0000-0003-0124-0463","affiliations":[{"raw_affiliation_string":"University of Chicago, Chicago, USA","institution_ids":["https://openalex.org/I40347166"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100702362"],"corresponding_institution_ids":["https://openalex.org/I40347166"],"apc_list":null,"apc_paid":null,"fwci":5.8015,"has_fulltext":true,"cited_by_count":47,"citation_normalized_percentile":{"value":0.96870428,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"16"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11032","display_name":"VLSI and Analog Circuit Testing","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.710595965385437},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6785663962364197},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6122034788131714},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.5347933173179626},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5164470672607422},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3848353922367096},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3761320114135742},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3494362533092499}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.710595965385437},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6785663962364197},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6122034788131714},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.5347933173179626},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5164470672607422},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3848353922367096},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3761320114135742},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3494362533092499},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3579371.3589105","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3579371.3589105","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3579371.3589105","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 50th Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3579371.3589105","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3579371.3589105","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3579371.3589105","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 50th Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309626","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4380881139.pdf","grobid_xml":"https://content.openalex.org/works/W4380881139.grobid-xml"},"referenced_works_count":49,"referenced_works":["https://openalex.org/W1981504678","https://openalex.org/W2067717709","https://openalex.org/W2078668570","https://openalex.org/W2115081151","https://openalex.org/W2130189691","https://openalex.org/W2169596872","https://openalex.org/W2257408573","https://openalex.org/W2337485678","https://openalex.org/W2626574314","https://openalex.org/W2794670651","https://openalex.org/W2798273231","https://openalex.org/W2902203584","https://openalex.org/W2914237411","https://openalex.org/W2915262043","https://openalex.org/W2946682676","https://openalex.org/W2969553121","https://openalex.org/W2985374077","https://openalex.org/W2989569745","https://openalex.org/W3004127905","https://openalex.org/W3005560345","https://openalex.org/W3008928377","https://openalex.org/W3014034225","https://openalex.org/W3037261120","https://openalex.org/W3090586977","https://openalex.org/W3096295711","https://openalex.org/W3105084904","https://openalex.org/W3120931476","https://openalex.org/W3135074210","https://openalex.org/W3159836144","https://openalex.org/W3161596461","https://openalex.org/W3171842021","https://openalex.org/W3189398641","https://openalex.org/W3190062760","https://openalex.org/W4206336135","https://openalex.org/W4211189626","https://openalex.org/W4220837296","https://openalex.org/W4226335648","https://openalex.org/W4230841294","https://openalex.org/W4232187275","https://openalex.org/W4232418947","https://openalex.org/W4238194638","https://openalex.org/W4242472899","https://openalex.org/W4245199738","https://openalex.org/W4250541405","https://openalex.org/W4251754000","https://openalex.org/W4253362779","https://openalex.org/W4255032011","https://openalex.org/W4287822963","https://openalex.org/W4302344748"],"related_works":["https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W4375867731","https://openalex.org/W4377865163","https://openalex.org/W3193857078","https://openalex.org/W2888956734","https://openalex.org/W3208304128"],"abstract_inverted_index":{"Deep":[0],"neural":[1],"network":[2],"(DNN)":[3],"training":[4,26,60],"workloads":[5],"are":[6],"increasingly":[7],"susceptible":[8],"to":[9,20,29],"hardware":[10,30,55],"failures":[11,31,56],"in":[12,23],"datacenters.":[13],"For":[14],"example,":[15],"Google":[16],"experienced":[17],"\"mysterious,":[18],"difficult":[19],"identify":[21],"problems\"":[22],"their":[24],"TPU":[25],"systems":[27],"due":[28],"[7].":[32],"Although":[33],"these":[34],"particular":[35],"problems":[36],"were":[37],"subsequently":[38],"corrected":[39],"through":[40],"significant":[41],"efforts,":[42],"they":[43],"have":[44],"raised":[45],"the":[46,50],"urgency":[47],"of":[48],"addressing":[49],"growing":[51],"challenges":[52],"emerging":[53],"from":[54],"impacting":[57],"many":[58],"DNN":[59],"workloads.":[61]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":29},{"year":2024,"cited_by_count":13}],"updated_date":"2026-06-04T09:04:59.091469","created_date":"2025-10-10T00:00:00"}
