{"id":"https://openalex.org/W4404340628","doi":"https://doi.org/10.1145/3652892.3700781","title":"Deep Optimizer States: Towards Scalable Training of Transformer Models using Interleaved Offloading","display_name":"Deep Optimizer States: Towards Scalable Training of Transformer Models using Interleaved Offloading","publication_year":2024,"publication_date":"2024-11-27","ids":{"openalex":"https://openalex.org/W4404340628","doi":"https://doi.org/10.1145/3652892.3700781"},"language":"en","primary_location":{"id":"doi:10.1145/3652892.3700781","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652892.3700781","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 25th International Middleware Conference","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3652892.3700781","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049218913","display_name":"Avinash Maurya","orcid":"https://orcid.org/0000-0002-8200-0148"},"institutions":[{"id":"https://openalex.org/I155173764","display_name":"Rochester Institute of Technology","ror":"https://ror.org/00v4yb702","country_code":"US","type":"education","lineage":["https://openalex.org/I155173764"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Avinash Maurya","raw_affiliation_strings":["Rochester Institute of Technology, Rochester, New York, United States"],"affiliations":[{"raw_affiliation_string":"Rochester Institute of Technology, Rochester, New York, United States","institution_ids":["https://openalex.org/I155173764"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106355306","display_name":"Jie Ye","orcid":null},"institutions":[{"id":"https://openalex.org/I180949307","display_name":"Illinois Institute of Technology","ror":"https://ror.org/037t3ry66","country_code":"US","type":"education","lineage":["https://openalex.org/I180949307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jie Ye","raw_affiliation_strings":["Illinois Institute of Technology, Chicago, Illinois, USA"],"affiliations":[{"raw_affiliation_string":"Illinois Institute of Technology, Chicago, Illinois, USA","institution_ids":["https://openalex.org/I180949307"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054011637","display_name":"M. Mustafa Rafique","orcid":"https://orcid.org/0000-0002-5034-2880"},"institutions":[{"id":"https://openalex.org/I155173764","display_name":"Rochester Institute of Technology","ror":"https://ror.org/00v4yb702","country_code":"US","type":"education","lineage":["https://openalex.org/I155173764"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"M. Mustafa Rafique","raw_affiliation_strings":["Rochester Institute of Technology, Rochester, New York, United States"],"affiliations":[{"raw_affiliation_string":"Rochester Institute of Technology, Rochester, New York, United States","institution_ids":["https://openalex.org/I155173764"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046613458","display_name":"Franck Cappello","orcid":"https://orcid.org/0000-0002-7890-3934"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Franck Cappello","raw_affiliation_strings":["Argonne National Laboratory, Lemont, Illinois, USA"],"affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, Illinois, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085745891","display_name":"Bogdan Nicolae","orcid":"https://orcid.org/0000-0002-0661-7509"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bogdan Nicolae","raw_affiliation_strings":["Argonne National Laboratory, Lemont, Illinois, United States"],"affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, Illinois, United States","institution_ids":["https://openalex.org/I1282105669"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5049218913"],"corresponding_institution_ids":["https://openalex.org/I155173764"],"apc_list":null,"apc_paid":null,"fwci":1.5107,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.84417188,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"404","last_page":"416"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11206","display_name":"Model Reduction and Neural Networks","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11263","display_name":"Electromagnetic Simulation and Numerical Methods","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7212272882461548},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7092209458351135},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5661832094192505},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4964151978492737},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3827052116394043},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.0892782211303711},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08187076449394226},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.07942652702331543}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7212272882461548},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7092209458351135},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5661832094192505},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4964151978492737},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3827052116394043},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0892782211303711},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08187076449394226},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.07942652702331543},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3652892.3700781","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652892.3700781","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 25th International Middleware Conference","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2410.21316","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.21316","pdf_url":"https://arxiv.org/pdf/2410.21316","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3652892.3700781","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652892.3700781","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 25th International Middleware Conference","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2131453672","display_name":null,"funder_award_id":"2106635","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2777053550","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G3075337988","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G3357838806","display_name":null,"funder_award_id":"2411386","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3906604551","display_name":null,"funder_award_id":"DEAC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G4252079742","display_name":null,"funder_award_id":"DEAC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G4552540348","display_name":null,"funder_award_id":"0F-60169","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G5085543421","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G5367729914","display_name":null,"funder_award_id":"0F-60169","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G6755165505","display_name":null,"funder_award_id":"award","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6848031779","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6939511203","display_name":null,"funder_award_id":"DEAC02-06CH1135","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G7140536155","display_name":null,"funder_award_id":"2411387","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8981335390","display_name":null,"funder_award_id":"DEAC02-06CH11357","funder_id":"https://openalex.org/F4320337506","funder_display_name":"Advanced Scientific Computing Research"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320337506","display_name":"Advanced Scientific Computing Research","ror":"https://ror.org/0012c7r22"},{"id":"https://openalex.org/F4320338284","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1810943226","https://openalex.org/W2146502635","https://openalex.org/W2734941459","https://openalex.org/W2763421725","https://openalex.org/W2783292547","https://openalex.org/W2901299405","https://openalex.org/W2969388332","https://openalex.org/W2985738161","https://openalex.org/W3010830594","https://openalex.org/W3021124033","https://openalex.org/W3038581078","https://openalex.org/W3081168214","https://openalex.org/W3119866685","https://openalex.org/W3121562065","https://openalex.org/W3129831491","https://openalex.org/W3205803342","https://openalex.org/W3206606249","https://openalex.org/W3216052173","https://openalex.org/W4221167110","https://openalex.org/W4287594180","https://openalex.org/W4321636578","https://openalex.org/W4384918448","https://openalex.org/W4385623144","https://openalex.org/W4386707654","https://openalex.org/W4387559225","https://openalex.org/W4393186514","https://openalex.org/W4394947477","https://openalex.org/W4399757647","https://openalex.org/W4399794472","https://openalex.org/W6745245109","https://openalex.org/W6839710751"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659"],"abstract_inverted_index":{"Transformers":[0],"and":[1,23,36,51,93,109,117,130,135,152,178,190,237,248],"large":[2],"language":[3],"models":[4],"(LLMs)":[5],"have":[6,15],"seen":[7],"rapid":[8],"adoption":[9],"in":[10,75,111,157],"all":[11],"domains.":[12],"Their":[13],"sizes":[14],"exploded":[16],"to":[17,63,89,121,124,166,198],"hundreds":[18],"of":[19,21,31,55,101,133,148,171],"billions":[20],"parameters":[22],"keep":[24],"increasing.":[25],"Under":[26],"these":[27],"circumstances,":[28],"the":[29,53,65,83,90,99,102,127,146,149,158,172,176,179,200,211,214,224,232,235],"training":[30],"transformers":[32],"is":[33,59,106,207],"very":[34],"expensive":[35],"often":[37,107],"hits":[38],"a":[39,142,169,195],"\"memory":[40],"wall\",":[41],"i.e.,":[42],"even":[43],"when":[44],"using":[45,256],"3D":[46],"parallelism":[47],"(pipeline,":[48],"tensor,":[49],"data)":[50],"aggregating":[52],"memory":[54,92,105,160,181],"many":[56],"GPUs,":[57],"it":[58],"still":[60],"not":[61],"enough":[62],"hold":[64],"necessary":[66],"data":[67,115,227],"structures":[68],"(model":[69],"parameters,":[70],"optimizer":[71,84,173],"state,":[72,85],"gradients,":[73],"activations)":[74],"GPU":[76,159,180,215],"memory.":[77],"To":[78,185],"compensate,":[79],"state-of-the-art":[80,254],"approaches":[81,255],"offload":[82],"at":[86,182],"least":[87],"partially,":[88],"host":[91,177],"perform":[94],"hybrid":[95],"CPU-GPU":[96],"computations.":[97,118],"However,":[98],"management":[100],"combined":[103],"host-GPU":[104],"suboptimal":[108],"results":[110],"poor":[112],"overlapping":[113],"between":[114,175,226],"movements":[116],"This":[119],"leads":[120],"missed":[122],"opportunities":[123],"simultaneously":[125],"leverage":[126,141],"interconnect":[128],"bandwidth":[129],"computational":[131],"capabilities":[132],"CPUs":[134],"GPUs.":[136],"In":[137],"this":[138,186],"paper,":[139],"we":[140,188],"key":[143],"observation":[144],"that":[145,222],"interleaving":[147],"forward,":[150],"backward":[151],"update":[153,205],"phases":[154],"generate":[155],"fluctuations":[156],"utilization,":[161],"which":[162],"can":[163],"be":[164],"exploited":[165],"dynamically":[167],"move":[168],"part":[170],"state":[174],"each":[183],"iteration.":[184],"end,":[187],"design":[189],"implement":[191],"Deep":[192],"Optimizer":[193],"States,":[194],"novel":[196],"technique":[197],"split":[199],"LLM":[201],"into":[202],"subgroups,":[203],"whose":[204],"phase":[206],"scheduled":[208],"on":[209,217,231],"either":[210],"CPU":[212],"or":[213],"based":[216],"our":[218,244],"proposed":[219],"performance":[220],"model":[221],"addresses":[223],"trade-off":[225],"movement":[228],"cost,":[229],"acceleration":[230],"GPUs":[233],"vs":[234],"CPUs,":[236],"competition":[238],"for":[239],"shared":[240],"resources.":[241],"We":[242],"integrate":[243],"approach":[245],"with":[246],"DeepSpeed":[247],"demonstrate":[249],"2.5\u00d7":[250],"faster":[251],"iterations":[252],"over":[253],"extensive":[257],"experiments.":[258]},"counts_by_year":[{"year":2025,"cited_by_count":6}],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2025-10-10T00:00:00"}
