{"id":"https://openalex.org/W4413126132","doi":"https://doi.org/10.1109/lca.2025.3596616","title":"Checkflow: Low-Overhead Checkpointing for Deep Learning Training","display_name":"Checkflow: Low-Overhead Checkpointing for Deep Learning Training","publication_year":2025,"publication_date":"2025-07-01","ids":{"openalex":"https://openalex.org/W4413126132","doi":"https://doi.org/10.1109/lca.2025.3596616"},"language":"en","primary_location":{"id":"doi:10.1109/lca.2025.3596616","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3596616","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108270566","display_name":"Hangyu Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hangyu Liu","raw_affiliation_strings":["Southwest Jiaotong University, Chengdu, China","Southwest Jiaotong University, China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]},{"raw_affiliation_string":"Southwest Jiaotong University, China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075454046","display_name":"Shouxi Luo","orcid":"https://orcid.org/0000-0002-4041-3681"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shouxi Luo","raw_affiliation_strings":["Southwest Jiaotong University, Chengdu, China","Southwest Jiaotong University, China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]},{"raw_affiliation_string":"Southwest Jiaotong University, China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100343481","display_name":"Ke Li","orcid":"https://orcid.org/0000-0002-2189-7967"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ke Li","raw_affiliation_strings":["Southwest Jiaotong University, Chengdu, China","Southwest Jiaotong University, China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]},{"raw_affiliation_string":"Southwest Jiaotong University, China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007865788","display_name":"Huanlai Xing","orcid":"https://orcid.org/0000-0002-6345-7265"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huanlai Xing","raw_affiliation_strings":["Southwest Jiaotong University, Chengdu, China","Southwest Jiaotong University, China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]},{"raw_affiliation_string":"Southwest Jiaotong University, China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010159266","display_name":"Bo Peng","orcid":"https://orcid.org/0000-0002-8694-5106"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Peng","raw_affiliation_strings":["Southwest Jiaotong University, Chengdu, China","Southwest Jiaotong University, China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]},{"raw_affiliation_string":"Southwest Jiaotong University, China","institution_ids":["https://openalex.org/I4800084"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5108270566"],"corresponding_institution_ids":["https://openalex.org/I4800084"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.11566908,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"24","issue":"2","first_page":"281","last_page":"284"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10237","display_name":"Cryptography and Data Security","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10237","display_name":"Cryptography and Data Security","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.9747999906539917,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9645000100135803,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8356457948684692},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.762543261051178},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.7253533005714417},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5809453725814819},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.47474366426467896},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4658644497394562},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.4054843783378601},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.34434932470321655},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.33385777473449707},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3267684876918793},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3011440634727478}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8356457948684692},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.762543261051178},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.7253533005714417},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5809453725814819},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.47474366426467896},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4658644497394562},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.4054843783378601},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.34434932470321655},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.33385777473449707},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3267684876918793},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3011440634727478},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lca.2025.3596616","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3596616","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4375867731","https://openalex.org/W2611989081","https://openalex.org/W2418291489","https://openalex.org/W2731899572","https://openalex.org/W4230611425","https://openalex.org/W4294635752","https://openalex.org/W4304166257","https://openalex.org/W4383066092","https://openalex.org/W3215138031","https://openalex.org/W3096519538"],"abstract_inverted_index":{"During":[0],"the":[1,10,20,33,52,91,111,114,123,130,138,147],"time-consuming":[2],"training":[3,54,75,83,131,145,148],"of":[4,36,82,87,93,113,144],"deep":[5],"neural":[6],"network":[7],"(DNN)":[8],"models,":[9],"worker":[11],"has":[12,126],"to":[13,26,32],"periodically":[14],"create":[15,41],"checkpoints":[16,42],"for":[17,73,129,141],"tensors":[18],"like":[19],"model":[21],"parameters":[22],"and":[23,103],"optimizer":[24],"state":[25],"support":[27],"fast":[28],"failover.":[29],"However,":[30],"due":[31],"high":[34],"overhead":[35,154],"checkpointing,":[37],"existing":[38],"schemes":[39],"generally":[40],"at":[43],"a":[44,65,97],"very":[45],"low":[46],"frequency,":[47],"making":[48],"recovery":[49],"inefficient":[50],"since":[51],"unsaved":[53],"progress":[55],"would":[56],"get":[57],"lost.":[58],"In":[59],"this":[60],"paper,":[61],"we":[62],"propose":[63],"Checkflow,":[64],"low-overhead":[66],"checkpointing":[67,72],"scheme,":[68],"which":[69],"enables":[70],"per-iteration":[71],"DNN":[74],"with":[76,146,150],"minimal":[77],"or":[78,152],"even":[79],"zero":[80],"cost":[81],"slowdown.":[84],"The":[85],"power":[86],"Checkflow":[88,133],"stems":[89],"from":[90],"design":[92],"<inline-formula><tex-math":[94,104],"notation=\"LaTeX\">$i)$</tex-math></inline-formula>":[95],"decoupling":[96],"tensor\u2019s":[98],"checkpoint":[99,139],"operation":[100],"into":[101],"snapshot-then-offload,":[102],"notation=\"LaTeX\">$ii)$</tex-math></inline-formula>":[105],"scheduling":[106],"these":[107],"operations":[108,140],"appropriately,":[109],"following":[110],"results":[112,119],"math":[115],"models.":[116],"Our":[117],"experimental":[118],"imply":[120],"that,":[121],"when":[122],"GPU-CPU":[124],"connection":[125],"sufficient":[127],"bandwidth":[128],"workload,":[132],"can":[134],"theoretically":[135],"overlap":[136],"all":[137],"each":[142],"round":[143],"computation,":[149],"trivial":[151],"no":[153],"in":[155],"peak":[156],"GPU":[157],"memory":[158],"occupancy.":[159]},"counts_by_year":[],"updated_date":"2025-12-28T23:10:05.387466","created_date":"2025-10-10T00:00:00"}
