{"id":"https://openalex.org/W4416203804","doi":"https://doi.org/10.1145/3712285.3759891","title":"LowDiff: Efficient Frequent Checkpointing via Low-Cost Differential for High-Performance Distributed Training Systems","display_name":"LowDiff: Efficient Frequent Checkpointing via Low-Cost Differential for High-Performance Distributed Training Systems","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W4416203804","doi":"https://doi.org/10.1145/3712285.3759891"},"language":null,"primary_location":{"id":"doi:10.1145/3712285.3759891","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3712285.3759891","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100958327","display_name":"Chenxuan Yao","orcid":"https://orcid.org/0000-0002-1143-052X"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chenxuan Yao","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034856110","display_name":"Feifan Liu","orcid":"https://orcid.org/0009-0007-9790-7651"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feifan Liu","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065831190","display_name":"Yuchong Hu","orcid":"https://orcid.org/0000-0003-1265-7141"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuchong Hu","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053797000","display_name":"Zhengyu Liu","orcid":"https://orcid.org/0009-0003-2432-0092"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhengyu Liu","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100882939","display_name":"Xin Zheng","orcid":"https://orcid.org/0009-0007-7321-9264"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinjue Zheng","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112600223","display_name":"Wenxiang Zhou","orcid":"https://orcid.org/0009-0007-6391-7798"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxiang Zhou","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100958327"],"corresponding_institution_ids":["https://openalex.org/I47720641"],"apc_list":null,"apc_paid":null,"fwci":1.4888,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.8836951,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1113","last_page":"1126"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.578000009059906,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.578000009059906,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.04360000044107437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.029500000178813934,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.7045999765396118},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6503999829292297},{"id":"https://openalex.org/keywords/differential","display_name":"Differential (mechanical device)","score":0.46209999918937683},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3813000023365021},{"id":"https://openalex.org/keywords/distributed-algorithm","display_name":"Distributed algorithm","score":0.36340001225471497},{"id":"https://openalex.org/keywords/distributed-learning","display_name":"Distributed learning","score":0.3407999873161316}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.760699987411499},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.7045999765396118},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6503999829292297},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.609000027179718},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.46209999918937683},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3813000023365021},{"id":"https://openalex.org/C130120984","wikidata":"https://www.wikidata.org/wiki/Q2835898","display_name":"Distributed algorithm","level":2,"score":0.36340001225471497},{"id":"https://openalex.org/C2779582901","wikidata":"https://www.wikidata.org/wiki/Q21013010","display_name":"Distributed learning","level":2,"score":0.3407999873161316},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.3366999924182892},{"id":"https://openalex.org/C70061542","wikidata":"https://www.wikidata.org/wiki/Q989016","display_name":"Distributed database","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2858000099658966},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27250000834465027},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C3739613","wikidata":"https://www.wikidata.org/wiki/Q679003","display_name":"Distributed Computing Environment","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3712285.3759891","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3712285.3759891","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2060393849","https://openalex.org/W2108598243","https://openalex.org/W2194775991","https://openalex.org/W2886288284","https://openalex.org/W2982475424","https://openalex.org/W2985108934","https://openalex.org/W3005780259","https://openalex.org/W3081168214","https://openalex.org/W3091097978","https://openalex.org/W3129831491","https://openalex.org/W3206867815","https://openalex.org/W4221008227","https://openalex.org/W4318541692","https://openalex.org/W4386768656","https://openalex.org/W4387302750","https://openalex.org/W4387544273","https://openalex.org/W4388442649","https://openalex.org/W4390097689","https://openalex.org/W4394923484","https://openalex.org/W4399757647","https://openalex.org/W4402048013","https://openalex.org/W4407216788"],"related_works":[],"abstract_inverted_index":{"Distributed":[0],"training":[1,39,63],"of":[2],"large":[3],"deep-learning":[4],"models":[5],"often":[6],"leads":[7],"to":[8,47,54,60],"failures,":[9],"so":[10,57],"checkpointing":[11,22,43],"is":[12,52],"commonly":[13],"employed":[14],"for":[15,23],"recovery.":[16],"State-of-the-art":[17],"studies":[18],"focus":[19],"on":[20],"frequent":[21],"fast":[24],"recovery":[25],"from":[26],"failures.":[27],"However,":[28],"it":[29,51],"generates":[30],"numerous":[31],"checkpoints,":[32],"incurring":[33],"substantial":[34],"costs":[35],"and":[36],"thus":[37],"degrading":[38],"performance.":[40],"Recently,":[41],"differential":[42],"has":[44],"been":[45],"proposed":[46],"reduce":[48],"costs,":[49],"but":[50],"limited":[53],"recommendation":[55],"systems,":[56],"its":[58],"application":[59],"general":[61],"distributed":[62],"systems":[64],"remains":[65],"unexplored.":[66]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-12T00:00:00"}
