{"id":"https://openalex.org/W4414898297","doi":"https://doi.org/10.1109/cluster59342.2025.11186488","title":"Capricorn: Efficient In-Memory Checkpointing for MoE Model Training with Dynamicity Awareness","display_name":"Capricorn: Efficient In-Memory Checkpointing for MoE Model Training with Dynamicity Awareness","publication_year":2025,"publication_date":"2025-09-02","ids":{"openalex":"https://openalex.org/W4414898297","doi":"https://doi.org/10.1109/cluster59342.2025.11186488"},"language":"en","primary_location":{"id":"doi:10.1109/cluster59342.2025.11186488","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster59342.2025.11186488","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Cluster Computing (CLUSTER)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wenqian Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenqian Xie","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055426388","display_name":"Zhiquan Lai","orcid":"https://orcid.org/0000-0002-3458-4732"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiquan Lai","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106623069","display_name":"Shengwei Li","orcid":"https://orcid.org/0000-0002-7419-1511"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengwei Li","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100668786","display_name":"Weijie Liu","orcid":"https://orcid.org/0000-0002-8023-9913"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weijie Liu","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100392064","display_name":"Wei Wang","orcid":"https://orcid.org/0000-0002-7180-831X"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Wang","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101206777","display_name":"Yanqi Hao","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanqi Hao","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100440919","display_name":"Dong\u2010Sheng Li","orcid":"https://orcid.org/0000-0003-1283-6334"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongsheng Li","raw_affiliation_strings":["College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31290372,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"12"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9757000207901001,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9757000207901001,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9531000256538391,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10927","display_name":"Access Control and Trust","score":0.9143999814987183,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.7871999740600586},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.7301999926567078},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.7044000029563904},{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.6323000192642212},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5932000279426575},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5455999970436096},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.4065000116825104},{"id":"https://openalex.org/keywords/layer","display_name":"Layer (electronics)","score":0.3864000141620636},{"id":"https://openalex.org/keywords/application-layer","display_name":"Application layer","score":0.37459999322891235}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8702999949455261},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.7871999740600586},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.7301999926567078},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.7044000029563904},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.6323000192642212},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.60589998960495},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5932000279426575},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5455999970436096},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.42800000309944153},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.4065000116825104},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.3864000141620636},{"id":"https://openalex.org/C190793597","wikidata":"https://www.wikidata.org/wiki/Q189768","display_name":"Application layer","level":3,"score":0.37459999322891235},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.3513999879360199},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.3400999903678894},{"id":"https://openalex.org/C27602214","wikidata":"https://www.wikidata.org/wiki/Q1868547","display_name":"Locality of reference","level":3,"score":0.33000001311302185},{"id":"https://openalex.org/C2780860992","wikidata":"https://www.wikidata.org/wiki/Q16887485","display_name":"Network partition","level":2,"score":0.31520000100135803},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2888999879360199},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C7345512","wikidata":"https://www.wikidata.org/wiki/Q209372","display_name":"Transport layer","level":3,"score":0.2727000117301941},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.26109999418258667},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.25380000472068787},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cluster59342.2025.11186488","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster59342.2025.11186488","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Cluster Computing (CLUSTER)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5641074669","display_name":null,"funder_award_id":"62025208,62421002","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W2965373594","https://openalex.org/W3044837714","https://openalex.org/W3081168214","https://openalex.org/W3129831491","https://openalex.org/W4220741164","https://openalex.org/W4229801112","https://openalex.org/W4372267133","https://openalex.org/W4386396242","https://openalex.org/W4387302750","https://openalex.org/W4394923484","https://openalex.org/W4394998727","https://openalex.org/W4398796293","https://openalex.org/W4399757647","https://openalex.org/W4401211627","https://openalex.org/W4407216788"],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"has":[2],"been":[3],"extensively":[4],"adopted":[5],"for":[6,90,201],"its":[7],"incredible":[8],"capability":[9],"to":[10,41,74,138,167,217],"expand":[11],"model":[12,48,59,93,115,170],"scale":[13],"with":[14,169,174,205],"a":[15,38,85],"sub-linear":[16],"increase":[17],"in":[18,58,69],"computational":[19],"requirement.":[20],"Training":[21],"MoE":[22,71,92,212],"models":[23],"requires":[24],"substantial":[25],"computing":[26],"nodes":[27],"and":[28,107,163,184,222,232],"extended":[29],"periods,":[30],"necessitating":[31],"reliable":[32],"distributed":[33,70],"training":[34,43,60,228],"systems.":[35],"Checkpointing":[36],"is":[37],"common":[39],"approach":[40,89],"enhance":[42],"reliability":[44],"by":[45],"periodically":[46],"saving":[47],"states.":[49],"Current":[50],"checkpointing":[51,77,88,165],"optimizations":[52],"focus":[53],"on":[54,153],"hiding":[55],"checkpoint":[56,207],"overhead":[57,204],"computations.":[61,171],"However,":[62],"these":[63],"approaches":[64],"overlook":[65],"the":[66,98,105,111,123,126,130,139,147,154,175,182,188,192],"dynamicity":[67,99,176],"inherent":[68],"training,":[72],"leading":[73],"an":[75,198],"inefficient":[76],"mechanism.":[78],"In":[79],"this":[80],"paper,":[81],"we":[82],"propose":[83],"Capricorn,":[84],"dynamicity-aware":[86],"in-memory":[87],"efficient":[91],"training.":[94],"We":[95],"observe":[96],"that":[97],"impacts":[100],"computation":[101,119,127],"durations":[102],"at":[103,122,146],"both":[104],"layer":[106,112,132],"iteration":[108,124],"levels.":[109],"At":[110],"level,":[113,125],"different":[114],"layers":[116],"exhibit":[117],"various":[118],"durations,":[120],"while":[121],"time":[128],"of":[129,149,191],"same":[131],"differs":[133],"across":[134,177],"iterations.":[135],"To":[136,172],"adapt":[137],"layer-level":[140],"dynamicity,":[141],"Capricorn":[142,179,214],"employs":[143],"online":[144],"profiling":[145,155,183],"granularity":[148],"individual":[150],"layers.":[151],"Based":[152],"results,":[156],"it":[157],"strategically":[158],"partitions":[159],"checkpoints":[160],"into":[161],"chunks":[162],"schedules":[164],"communication":[166],"overlap":[168],"deal":[173],"iterations,":[178],"speculatively":[180],"activates":[181],"partitioning":[185],"processes":[186],"utilizing":[187],"temporal":[189],"locality":[190],"experts'":[193],"load.":[194],"It":[195],"can":[196],"produce":[197],"optimal":[199],"activation":[200],"low":[202],"runtime":[203],"high":[206],"partition":[208],"accuracy.":[209],"For":[210],"mainstream":[211],"models,":[213],"achieves":[215],"up":[216],"<tex":[218,223],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[219,224],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$1.56":[220],"\\times$</tex>":[221,226],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$5.98":[225],"end-to-end":[227],"speedup":[229],"over":[230],"Gemini":[231],"TorchSnapshot":[233],"respectively":[234],"under":[235],"per-iteration":[236],"checkpointing.":[237]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
