{"id":"https://openalex.org/W4312892127","doi":"https://doi.org/10.1145/3522664.3528605","title":"Checkpointing and deterministic training for deep learning","display_name":"Checkpointing and deterministic training for deep learning","publication_year":2022,"publication_date":"2022-05-16","ids":{"openalex":"https://openalex.org/W4312892127","doi":"https://doi.org/10.1145/3522664.3528605"},"language":"en","primary_location":{"id":"doi:10.1145/3522664.3528605","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3522664.3528605","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3522664.3528605","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 1st International Conference on AI Engineering: Software Engineering for AI","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3522664.3528605","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036121611","display_name":"Xiangzhe Xu","orcid":"https://orcid.org/0000-0001-6619-781X"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Xiangzhe Xu","raw_affiliation_strings":["Purdue University"],"affiliations":[{"raw_affiliation_string":"Purdue University","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100773324","display_name":"Hongyu Liu","orcid":"https://orcid.org/0000-0002-2281-019X"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hongyu Liu","raw_affiliation_strings":["Purdue University"],"affiliations":[{"raw_affiliation_string":"Purdue University","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014842586","display_name":"Guanhong Tao","orcid":"https://orcid.org/0000-0002-4701-1327"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Guanhong Tao","raw_affiliation_strings":["Purdue University"],"affiliations":[{"raw_affiliation_string":"Purdue University","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039720331","display_name":"Xuan Zhou","orcid":"https://orcid.org/0000-0001-6591-8515"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhou Xuan","raw_affiliation_strings":["Purdue University"],"affiliations":[{"raw_affiliation_string":"Purdue University","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100362465","display_name":"Xiangyu Zhang","orcid":"https://orcid.org/0000-0003-2138-4608"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiangyu Zhang","raw_affiliation_strings":["Purdue University"],"affiliations":[{"raw_affiliation_string":"Purdue University","institution_ids":["https://openalex.org/I219193219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5036121611"],"corresponding_institution_ids":["https://openalex.org/I219193219"],"apc_list":null,"apc_paid":null,"fwci":0.4164,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.69506132,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"65","last_page":"76"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8980857133865356},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6399766802787781},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5263142585754395},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5154879689216614},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5133683085441589},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5052638649940491},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4738999903202057},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.46269094944000244},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.455982506275177},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.41575154662132263},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3306698203086853},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.1892123520374298}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8980857133865356},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6399766802787781},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5263142585754395},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5154879689216614},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5133683085441589},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5052638649940491},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4738999903202057},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.46269094944000244},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.455982506275177},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.41575154662132263},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3306698203086853},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.1892123520374298},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3522664.3528605","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3522664.3528605","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3522664.3528605","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 1st International Conference on AI Engineering: Software Engineering for AI","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3522664.3528605","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3522664.3528605","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3522664.3528605","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 1st International Conference on AI Engineering: Software Engineering for AI","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6399999856948853,"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth"}],"awards":[{"id":"https://openalex.org/G1448401626","display_name":null,"funder_award_id":"N000141712947","funder_id":"https://openalex.org/F4320337345","funder_display_name":"Office of Naval Research"},{"id":"https://openalex.org/G175455262","display_name":null,"funder_award_id":"W911NF-19-S-0012","funder_id":"https://openalex.org/F4320333051","funder_display_name":"Intelligence Advanced Research Projects Activity"},{"id":"https://openalex.org/G1865040260","display_name":null,"funder_award_id":"TrojAI W911NF-19-S-0012","funder_id":"https://openalex.org/F4320333051","funder_display_name":"Intelligence Advanced Research Projects Activity"},{"id":"https://openalex.org/G2304153372","display_name":null,"funder_award_id":"N000141410468","funder_id":"https://openalex.org/F4320337345","funder_display_name":"Office of Naval Research"},{"id":"https://openalex.org/G2614991508","display_name":null,"funder_award_id":"1910300","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5643251411","display_name":null,"funder_award_id":"and N00","funder_id":"https://openalex.org/F4320337345","funder_display_name":"Office of Naval Research"},{"id":"https://openalex.org/G608057831","display_name":null,"funder_award_id":"W911NF-19","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7284570364","display_name":null,"funder_award_id":"1901242","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7485119911","display_name":null,"funder_award_id":"N000141712045","funder_id":"https://openalex.org/F4320337345","funder_display_name":"Office of Naval Research"},{"id":"https://openalex.org/G8127390166","display_name":"For support to the NSTC Committee on Tecynology Innovation Workshop and subsequent policy analysis and implementation.","funder_award_id":"0001414","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8524305138","display_name":"STTR Phase I:  Customized Manufacture of Protective Headgear for Mitigation of Fall Related Injuries","funder_award_id":"1417120","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8535495650","display_name":"Multiscale Generalized Correlation: A Unified Distance-Based Correlation Measure for Dependency Discovery","funder_award_id":"1712947","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8876996369","display_name":null,"funder_award_id":"N00014","funder_id":"https://openalex.org/F4320337345","funder_display_name":"Office of Naval Research"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320333051","display_name":"Intelligence Advanced Research Projects Activity","ror":"https://ror.org/01v3fsc55"},{"id":"https://openalex.org/F4320337345","display_name":"Office of Naval Research","ror":"https://ror.org/00rk2pe57"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4312892127.pdf","grobid_xml":"https://content.openalex.org/works/W4312892127.grobid-xml"},"referenced_works_count":13,"referenced_works":["https://openalex.org/W2051387582","https://openalex.org/W2108598243","https://openalex.org/W2114067856","https://openalex.org/W2123511529","https://openalex.org/W2255006515","https://openalex.org/W2605106683","https://openalex.org/W2914209329","https://openalex.org/W2942091739","https://openalex.org/W2963178695","https://openalex.org/W3122184684","https://openalex.org/W4245223843","https://openalex.org/W4251521742","https://openalex.org/W4251828973"],"related_works":["https://openalex.org/W1517776641","https://openalex.org/W2242574529","https://openalex.org/W2391167130","https://openalex.org/W2092071486","https://openalex.org/W4283067488","https://openalex.org/W2099185950","https://openalex.org/W1572523360","https://openalex.org/W26297114","https://openalex.org/W2134879172","https://openalex.org/W1506438023"],"abstract_inverted_index":{"Checkpointing":[0],"and":[1,23,80,154,164],"faithful":[2,81,128],"replay":[3,65,165],"are":[4,125,132],"important":[5],"for":[6,83,127],"the":[7,28,104],"training":[8,32,46,67,87],"process":[9],"of":[10,43,106,121],"a":[11,44,66,75,91,113,119,136,143],"Deep":[12],"Learning":[13],"(DL)":[14],"model.":[15],"It":[16,158,171],"may":[17,48],"improve":[18],"productivity,":[19],"model":[20],"performance,":[21],"robustness,":[22],"help":[24],"security":[25],"auditing.":[26],"However,":[27],"inherent":[29],"nondeterminism":[30],"in":[31,103,135,175,178],"poses":[33],"prominent":[34],"challenges.":[35],"Even":[36],"with":[37,168],"fixed":[38],"random":[39,93,101],"seeds,":[40],"multiple":[41],"runs":[42],"same":[45],"pipeline":[47],"yield":[49],"models":[50,153],"whose":[51],"performance":[52],"varies":[53],"by":[54,140],"20%":[55],"percent.":[56],"With":[57],"existing":[58],"infrastructural":[59],"checkpointing":[60,79],"support,":[61],"developers":[62,174],"cannot":[63],"faithfully":[64],"process.":[68],"In":[69,109],"this":[70],"paper,":[71],"we":[72,111],"propose":[73],"DETrain,":[74],"new":[76],"solution":[77],"to":[78],"execution/replay":[82],"long":[84],"running":[85],"DL":[86],"programs.":[88],"We":[89],"introduce":[90],"novel":[92,114],"number":[94],"generation":[95],"mechanism":[96],"that":[97,116,124],"can":[98,117,159],"generate":[99],"consistent":[100],"numbers":[102],"presence":[105],"data":[107],"parallelism.":[108],"addition,":[110],"devise":[112],"analysis":[115],"determine":[118],"set":[120],"state":[122],"variables":[123,131],"necessary":[126],"replay.":[129],"These":[130],"either":[133],"saved":[134],"checkpoint":[137],"or":[138],"re-generated":[139],"fast":[141],"forwarding,":[142],"selective":[144],"execution":[145],"technique.":[146],"DETrain":[147],"is":[148],"evaluated":[149],"on":[150],"13":[151],"PyTorch":[152],"16":[155],"Tensorflow":[156],"models.":[157],"deterministically":[160],"execute":[161],"these":[162],"programs":[163],"from":[166],"checkpoints":[167],"reasonable":[169],"overhead.":[170],"also":[172],"helps":[173],"diagnosing":[176],"problems":[177],"training.":[179]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
