{"id":"https://openalex.org/W4414197649","doi":"https://doi.org/10.1109/dac63849.2025.11132754","title":"SSDTrain: An Activation Offloading Framework to SSDs for Faster Large Language Model Training","display_name":"SSDTrain: An Activation Offloading Framework to SSDs for Faster Large Language Model Training","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414197649","doi":"https://doi.org/10.1109/dac63849.2025.11132754"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11132754","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132754","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073106039","display_name":"Kun Wu","orcid":"https://orcid.org/0000-0002-0149-1409"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kun Wu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100776534","display_name":"Jeongmin Park","orcid":"https://orcid.org/0000-0001-8027-0876"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jeongmin Brian Park","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100330732","display_name":"Xiaofan Zhang","orcid":"https://orcid.org/0000-0001-7048-4803"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaofan Zhang","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024169920","display_name":"Mert Hidayeto\u011flu","orcid":"https://orcid.org/0000-0001-9276-5075"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mert Hidayeto\u011flu","raw_affiliation_strings":["University of California,Irvine"],"affiliations":[{"raw_affiliation_string":"University of California,Irvine","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041724753","display_name":"Vikram Sharma Mailthody","orcid":"https://orcid.org/0000-0002-9611-8075"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vikram Sharma Mailthody","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113182643","display_name":"Sitao Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sitao Huang","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052474229","display_name":"Steve Lumetta","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Steve Lumetta","raw_affiliation_strings":["Google"],"affiliations":[{"raw_affiliation_string":"Google","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040404999","display_name":"Wen\u2010mei Hwu","orcid":"https://orcid.org/0000-0003-2532-5349"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen-Mei Hwu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5073106039"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.7137,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.95110938,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8180999755859375,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8180999755859375,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7458999752998352},{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.6815999746322632},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.5885999798774719},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.38940000534057617},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.35190001130104065},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.34459999203681946},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.34119999408721924},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.34049999713897705}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8429999947547913},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7458999752998352},{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.6815999746322632},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5942999720573425},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.5885999798774719},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.38940000534057617},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3521000146865845},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.35190001130104065},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.35190001130104065},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.34459999203681946},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.34119999408721924},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.3336000144481659},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.31850001215934753},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C177950962","wikidata":"https://www.wikidata.org/wiki/Q10997658","display_name":"Non-volatile memory","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2599000036716461},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.257099986076355},{"id":"https://openalex.org/C184596265","wikidata":"https://www.wikidata.org/wiki/Q2651576","display_name":"Model of computation","level":3,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11132754","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132754","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309327","display_name":"Google","ror":"https://ror.org/00njsd438"},{"id":"https://openalex.org/F4320316786","display_name":"Center for Cognitive Computing Systems Research","ror":null},{"id":"https://openalex.org/F4320317220","display_name":"National Energy Research Scientific Computing Center","ror":"https://ror.org/05v3mvq14"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2010390358","https://openalex.org/W2489529491","https://openalex.org/W3012479151","https://openalex.org/W3012514909","https://openalex.org/W3129831491","https://openalex.org/W3205803342","https://openalex.org/W4247353671","https://openalex.org/W4251178606","https://openalex.org/W4287413833","https://openalex.org/W4308273116","https://openalex.org/W4321636575","https://openalex.org/W4321636583","https://openalex.org/W4385245566","https://openalex.org/W4387321091","https://openalex.org/W4401408886","https://openalex.org/W4402671659"],"related_works":[],"abstract_inverted_index":{"The":[0],"growth":[1],"rate":[2],"of":[3,17,20,141],"the":[4,18,26,44,62,142,148,154,157,176,188],"GPU":[5,45,84,168],"memory":[6,46,85,145,169,178,191],"capacity":[7],"has":[8],"not":[9],"been":[10],"able":[11],"to":[12,50,61,78,120,196],"keep":[13],"up":[14],"with":[15,95,100,127,156,164,180],"that":[16,137],"size":[19,202],"large":[21],"language":[22],"models":[23],"(LLMs),":[24],"hindering":[25],"model":[27],"training":[28,52],"process.":[29],"In":[30],"particular,":[31],"activations-the":[32],"intermediate":[33],"tensors":[34],"produced":[35],"during":[36],"forward":[37],"propagation":[38],"and":[39,108,110,118,133,159,170,203],"reused":[40],"in":[41,167],"backward":[42],"propagation-dominate":[43],"use.":[47],"This":[48],"leads":[49],"high":[51],"overheads":[53],"such":[54,114],"as":[55,115],"expensive":[56],"weight":[57],"update":[58],"costs":[59],"due":[60],"small":[63],"micro-batch":[64,201],"size.":[65],"To":[66],"address":[67],"this":[68],"challenge,":[69],"we":[70],"propose":[71],"SSDTrain,":[72],"an":[73],"adaptive":[74],"activation":[75,143,190],"offloading":[76],"framework":[77],"high-capacity":[79],"NVMe":[80],"SSDs.":[81],"SSDTrain":[82,97,138,151,174],"reduces":[83,139],"usage":[86],"without":[87],"impacting":[88],"performance":[89],"by":[90,199],"fully":[91],"overlapping":[92],"data":[93],"transfers":[94],"computation.":[96],"is":[98],"compatible":[99],"popular":[101,128],"deep":[102],"learning":[103],"frameworks":[104],"like":[105,130],"PyTorch,":[106],"Megatron,":[107],"DeepSpeed,":[109],"it":[111],"employs":[112],"techniques":[113],"tensor":[116],"deduplication":[117],"forwarding":[119],"further":[121,185],"enhance":[122],"efficiency.":[123],"We":[124,184],"extensively":[125],"experimented":[126],"LLMs":[129],"GPT,":[131],"BERT,":[132],"T5.":[134],"Results":[135],"demonstrate":[136],"47%":[140],"peak":[144],"usage.":[146],"At":[147],"same":[149],"time,":[150],"perfectly":[152],"overlaps":[153],"I/O":[155],"computation":[158],"incurs":[160],"negligible":[161,181],"overhead.":[162],"Compared":[163],"keeping":[165],"activations":[166],"layerwise":[171],"full":[172],"recomputation,":[173],"achieves":[175],"best":[177],"savings":[179],"throughput":[182,198],"loss.":[183],"analyze":[186],"how":[187],"reduced":[189],"use":[192],"may":[193],"be":[194],"leveraged":[195],"increase":[197],"increasing":[200],"reducing":[204],"pipeline":[205],"parallelism":[206],"bubbles.":[207]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
