{"id":"https://openalex.org/W4413788181","doi":"https://doi.org/10.1145/3757347.3759145","title":"Compute-based Fault Tolerance for DNN","display_name":"Compute-based Fault Tolerance for DNN","publication_year":2025,"publication_date":"2025-08-28","ids":{"openalex":"https://openalex.org/W4413788181","doi":"https://doi.org/10.1145/3757347.3759145"},"language":"en","primary_location":{"id":"doi:10.1145/3757347.3759145","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3757347.3759145","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th ACM International Systems and Storage Conference","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028514135","display_name":"Adi Molkho","orcid":"https://orcid.org/0000-0002-7943-3057"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Adi Molkho","raw_affiliation_strings":["Huawei Cloud, Israel"],"raw_orcid":"https://orcid.org/0000-0002-7943-3057","affiliations":[{"raw_affiliation_string":"Huawei Cloud, Israel","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069701905","display_name":"Amit Golander","orcid":"https://orcid.org/0009-0000-6798-6183"},"institutions":[{"id":"https://openalex.org/I16391192","display_name":"Tel Aviv University","ror":"https://ror.org/04mhzgx49","country_code":"IL","type":"education","lineage":["https://openalex.org/I16391192"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Amit Golander","raw_affiliation_strings":["Tel-Aviv University, Israel"],"raw_orcid":"https://orcid.org/0009-0000-6798-6183","affiliations":[{"raw_affiliation_string":"Tel-Aviv University, Israel","institution_ids":["https://openalex.org/I16391192"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010957839","display_name":"Oded Schwartz","orcid":"https://orcid.org/0000-0003-1309-5566"},"institutions":[{"id":"https://openalex.org/I197251160","display_name":"Hebrew University of Jerusalem","ror":"https://ror.org/03qxff017","country_code":"IL","type":"education","lineage":["https://openalex.org/I197251160"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Oded Schwartz","raw_affiliation_strings":["The Hebrew University, Israel"],"raw_orcid":"https://orcid.org/0000-0003-1309-5566","affiliations":[{"raw_affiliation_string":"The Hebrew University, Israel","institution_ids":["https://openalex.org/I197251160"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5028514135"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.24407655,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"216","last_page":"216"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9725000262260437,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9725000262260437,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7733495831489563},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6869841814041138},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.31303080916404724}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7733495831489563},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6869841814041138},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.31303080916404724}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3757347.3759145","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3757347.3759145","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th ACM International Systems and Storage Conference","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/13","display_name":"Climate action","score":0.7400000095367432}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2144438995","https://openalex.org/W1577886464","https://openalex.org/W2164896586","https://openalex.org/W1593559483","https://openalex.org/W2108334564","https://openalex.org/W2111125783","https://openalex.org/W2100367016"],"abstract_inverted_index":{"Deep":[0],"neural":[1],"network":[2],"(DNN)":[3],"systems":[4],"use":[5],"many":[6],"GPUs,":[7],"which":[8],"can":[9],"fail---making":[10],"fault":[11],"tolerance":[12],"(FT)":[13],"essential":[14],"to":[15],"avoid":[16],"cluster":[17],"restarts.":[18],"Traditional":[19],"FT":[20],"relies":[21],"on":[22],"frequent":[23],"checkpointing,":[24],"incurring":[25],"high":[26],"bandwidth":[27],"and":[28,41,49],"memory":[29],"costs.":[30],"We":[31,45],"propose":[32],"an":[33],"alternative":[34],"strategy":[35],"using":[36],"GPU":[37],"redundancy,":[38],"introducing":[39],"uniform":[40],"heterogeneous":[42],"encoding":[43],"approaches.":[44],"analyze":[46],"their":[47],"costs":[48],"recommend":[50],"usage":[51],"scenarios,":[52],"especially":[53],"for":[54],"emerging":[55],"rack-scale":[56],"AI":[57],"computers":[58]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
