{"id":"https://openalex.org/W4416203800","doi":"https://doi.org/10.1145/3712285.3759883","title":"Fine-grained Automated Failure Management for Extreme-Scale GPU Accelerated Systems","display_name":"Fine-grained Automated Failure Management for Extreme-Scale GPU Accelerated Systems","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W4416203800","doi":"https://doi.org/10.1145/3712285.3759883"},"language":null,"primary_location":{"id":"doi:10.1145/3712285.3759883","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759883","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3712285.3759883","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120597489","display_name":"Yonatan Levitt","orcid":"https://orcid.org/0009-0000-1346-6908"},"institutions":[{"id":"https://openalex.org/I4210104622","display_name":"Intel (Israel)","ror":"https://ror.org/027t2s119","country_code":"IL","type":"company","lineage":["https://openalex.org/I1343180700","https://openalex.org/I4210104622"]}],"countries":["IL"],"is_corresponding":true,"raw_author_name":"Yonatan Levitt","raw_affiliation_strings":["Intel Corporation, Jerusalem, Israel"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Jerusalem, Israel","institution_ids":["https://openalex.org/I4210104622"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120497143","display_name":"Richard Barella","orcid":"https://orcid.org/0000-0002-2881-3746"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Richard Barella","raw_affiliation_strings":["Intel Corporation, Hillsboro, Oregon, USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Hillsboro, Oregon, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120325300","display_name":"Sam Zeltner","orcid":"https://orcid.org/0009-0003-4347-2462"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sam Zeltner","raw_affiliation_strings":["Intel Corporation, Hillsboro, Oregon, USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Hillsboro, Oregon, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054754082","display_name":"Thomas E. Musta","orcid":"https://orcid.org/0009-0002-4577-4125"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]},{"id":"https://openalex.org/I4210120349","display_name":"University of Minnesota Rochester","ror":"https://ror.org/02rh4fw73","country_code":"US","type":"education","lineage":["https://openalex.org/I4210120349"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Thomas Musta","raw_affiliation_strings":["Intel Corporation, Rochester, Minnesota, USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Rochester, Minnesota, USA","institution_ids":["https://openalex.org/I1343180700","https://openalex.org/I4210120349"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048304559","display_name":"Lance Cheney","orcid":"https://orcid.org/0009-0003-4088-2626"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lance Cheney","raw_affiliation_strings":["Intel Corporation, Folsom, California, USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Folsom, California, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015518599","display_name":"Gustavo Espinosa","orcid":"https://orcid.org/0000-0003-0401-7355"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gustavo Espinosa","raw_affiliation_strings":["Intel Corporation, Hillsboro, Oregon, USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Hillsboro, Oregon, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066591464","display_name":"Olivier Franza","orcid":"https://orcid.org/0009-0007-0803-9064"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Olivier Franza","raw_affiliation_strings":["Intel Corporation, Boston, Massechusets, USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Boston, Massechusets, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001838603","display_name":"Balazs Gerofi","orcid":"https://orcid.org/0009-0004-8585-6031"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Balazs Gerofi","raw_affiliation_strings":["Intel Corporation, Hillsboro, Oregon, USA and RIKEN Center for Computational Science (R-CCS), Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Intel Corporation, Hillsboro, Oregon, USA and RIKEN Center for Computational Science (R-CCS), Tokyo, Japan","institution_ids":["https://openalex.org/I1343180700"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5120597489"],"corresponding_institution_ids":["https://openalex.org/I4210104622"],"apc_list":null,"apc_paid":null,"fwci":2.5505,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.91617975,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1073","last_page":"1084"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.25189998745918274,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.25189998745918274,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.09650000184774399,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.08160000294446945,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mean-time-between-failures","display_name":"Mean time between failures","score":0.941100001335144},{"id":"https://openalex.org/keywords/failure-rate","display_name":"Failure rate","score":0.47609999775886536},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.47609999775886536},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.3513000011444092},{"id":"https://openalex.org/keywords/duration","display_name":"Duration (music)","score":0.2937999963760376}],"concepts":[{"id":"https://openalex.org/C44154001","wikidata":"https://www.wikidata.org/wiki/Q754940","display_name":"Mean time between failures","level":3,"score":0.941100001335144},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6115999817848206},{"id":"https://openalex.org/C163164238","wikidata":"https://www.wikidata.org/wiki/Q2737027","display_name":"Failure rate","level":2,"score":0.47609999775886536},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.47609999775886536},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.4722000062465668},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.3513000011444092},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.31060001254081726},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C193519340","wikidata":"https://www.wikidata.org/wiki/Q891179","display_name":"Data loss","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C23725684","wikidata":"https://www.wikidata.org/wiki/Q616377","display_name":"Maintenance engineering","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3712285.3759883","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759883","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3712285.3759883","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759883","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1578105119","https://openalex.org/W1966243865","https://openalex.org/W1980701401","https://openalex.org/W1981432246","https://openalex.org/W1984564341","https://openalex.org/W1986905947","https://openalex.org/W1998105824","https://openalex.org/W1999900893","https://openalex.org/W2033656974","https://openalex.org/W2056218037","https://openalex.org/W2069193766","https://openalex.org/W2133046454","https://openalex.org/W2142812297","https://openalex.org/W2145071552","https://openalex.org/W2146351362","https://openalex.org/W2318507312","https://openalex.org/W2400561398","https://openalex.org/W2767346922","https://openalex.org/W2770542984","https://openalex.org/W2961595369","https://openalex.org/W2969305743","https://openalex.org/W3004697822","https://openalex.org/W3129927603","https://openalex.org/W3148902696","https://openalex.org/W3190774216","https://openalex.org/W4239389894","https://openalex.org/W4280651118","https://openalex.org/W4284964190","https://openalex.org/W4399282312"],"related_works":[],"abstract_inverted_index":{"As":[0,56],"high-performance":[1],"computing":[2],"(HPC)":[3],"systems":[4],"scale":[5],"in":[6],"size,":[7],"system":[8],"wide":[9],"hardware":[10],"failure":[11],"rates":[12],"increase.":[13],"Historical":[14],"data":[15],"from":[16,39],"previous":[17],"large-scale":[18],"HPC":[19],"installations":[20],"illustrate":[21],"this":[22],"trend,":[23],"with":[24],"the":[25,34,68],"mean":[26,59],"time":[27,60],"between":[28],"failures":[29],"(MTBF)":[30],"decreasing":[31],"steadily":[32],"over":[33],"past":[35],"decade.":[36],"Recent":[37],"studies":[38],"artificial":[40],"intelligence":[41],"and":[42],"machine-learning":[43],"(AI/ML)":[44],"training":[45],"extrapolate":[46],"MTBF":[47,57],"declining":[48],"even":[49],"further":[50],"for":[51,70],"future":[52],"GPU":[53],"accelerated":[54],"systems.":[55],"decreases,":[58],"to":[61],"repair":[62],"(MTTR)":[63],"becomes":[64],"more":[65],"pronounced,":[66],"highlighting":[67],"need":[69],"efficient":[71],"recovery":[72],"strategies.":[73]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-12T00:00:00"}
