{"id":"https://openalex.org/W4404623651","doi":"https://doi.org/10.1088/2632-2153/ad9667","title":"ZeROf-Offload: forward-gradient scheme for efficient full parameter fine-tuning of billion-scale language models","display_name":"ZeROf-Offload: forward-gradient scheme for efficient full parameter fine-tuning of billion-scale language models","publication_year":2024,"publication_date":"2024-11-22","ids":{"openalex":"https://openalex.org/W4404623651","doi":"https://doi.org/10.1088/2632-2153/ad9667"},"language":"en","primary_location":{"id":"doi:10.1088/2632-2153/ad9667","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ad9667","pdf_url":null,"source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1088/2632-2153/ad9667","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043296519","display_name":"Jian Zhu","orcid":"https://orcid.org/0000-0002-4198-9009"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jian Zhu","raw_affiliation_strings":["School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","State Key Laboratory of Strength & Vibration of Mechanical Structures, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China"],"affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","institution_ids":["https://openalex.org/I87445476"]},{"raw_affiliation_string":"State Key Laboratory of Strength & Vibration of Mechanical Structures, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010660790","display_name":"Peng Feng","orcid":"https://orcid.org/0000-0002-2997-3899"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peicheng Feng","raw_affiliation_strings":["School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China"],"affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088992593","display_name":"Jiawei Lu","orcid":"https://orcid.org/0000-0001-8003-2114"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiawei Lu","raw_affiliation_strings":["School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China"],"affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076410548","display_name":"Bo Fang","orcid":"https://orcid.org/0009-0000-5740-2412"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bowei Fang","raw_affiliation_strings":["School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China"],"affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102112058","display_name":"Hannah Honghua Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]},{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hesong Yang","raw_affiliation_strings":["Business Technology Department, Xiaohongshu, Shanghai 200025, People\u2019s Republic of China","School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China"],"affiliations":[{"raw_affiliation_string":"Business Technology Department, Xiaohongshu, Shanghai 200025, People\u2019s Republic of China","institution_ids":["https://openalex.org/I862669128"]},{"raw_affiliation_string":"School of Mechanical Engineering, Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi 710049, People\u2019s Republic of China","institution_ids":["https://openalex.org/I87445476"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5043296519"],"corresponding_institution_ids":["https://openalex.org/I87445476"],"apc_list":{"value":1600,"currency":"GBP","value_usd":1962},"apc_paid":{"value":1600,"currency":"GBP","value_usd":1962},"fwci":0.6909,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.76852605,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"5","issue":"4","first_page":"045054","last_page":"045054"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9501000046730042,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8130548000335693},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6083202958106995},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.5864295959472656},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5287731885910034},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4966591000556946},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.43887796998023987},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.436451256275177},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4359845519065857},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.17549169063568115}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8130548000335693},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6083202958106995},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.5864295959472656},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5287731885910034},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4966591000556946},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.43887796998023987},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.436451256275177},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4359845519065857},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.17549169063568115},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1088/2632-2153/ad9667","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ad9667","pdf_url":null,"source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:1fe8bb06603d43569e6d650aa679d5f0","is_oa":true,"landing_page_url":"https://doaj.org/article/1fe8bb06603d43569e6d650aa679d5f0","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Machine Learning: Science and Technology, Vol 5, Iss 4, p 045054 (2024)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1088/2632-2153/ad9667","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ad9667","pdf_url":null,"source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","score":0.5,"id":"https://metadata.un.org/sdg/8"}],"awards":[{"id":"https://openalex.org/G2496610418","display_name":null,"funder_award_id":"2022M712540","funder_id":"https://openalex.org/F4320335443","funder_display_name":"Postdoctoral Research Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320335443","display_name":"Postdoctoral Research Foundation of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W2014702547","https://openalex.org/W2054591139","https://openalex.org/W2892341857","https://openalex.org/W2919358988","https://openalex.org/W2942231644","https://openalex.org/W3025017204","https://openalex.org/W3086105743","https://openalex.org/W3129831491","https://openalex.org/W3204998121","https://openalex.org/W3205803342","https://openalex.org/W3211022409","https://openalex.org/W4322766882","https://openalex.org/W4382246105","https://openalex.org/W4387490420","https://openalex.org/W6631190155","https://openalex.org/W6635935089","https://openalex.org/W6703652217","https://openalex.org/W6745245109","https://openalex.org/W6747638306","https://openalex.org/W6751238189","https://openalex.org/W6752778037","https://openalex.org/W6753584990","https://openalex.org/W6756379755","https://openalex.org/W6756718674","https://openalex.org/W6768723914","https://openalex.org/W6770176827","https://openalex.org/W6774125022","https://openalex.org/W6787953186","https://openalex.org/W6794586554","https://openalex.org/W6810081322","https://openalex.org/W6810476776","https://openalex.org/W6840814951","https://openalex.org/W6849081016","https://openalex.org/W6850625674","https://openalex.org/W6851329549","https://openalex.org/W6852248262","https://openalex.org/W6854084413","https://openalex.org/W6854176537","https://openalex.org/W6854319068","https://openalex.org/W6857107603","https://openalex.org/W6857455670","https://openalex.org/W6858461694"],"related_works":["https://openalex.org/W2317245370","https://openalex.org/W2076915000","https://openalex.org/W4249323025","https://openalex.org/W198851386","https://openalex.org/W2030310580","https://openalex.org/W947442053","https://openalex.org/W1980160788","https://openalex.org/W2148915962","https://openalex.org/W2283866686","https://openalex.org/W4287182096"],"abstract_inverted_index":{"Abstract":[0],"In":[1,45],"large":[2],"language":[3],"models":[4,104,123],"(LLMs),":[5],"full-parameter":[6],"fine-tuning":[7,58,75],"is":[8],"crucial":[9],"for":[10,103,122,143],"task-specific":[11],"adaptation.":[12],"Traditionally,":[13],"this":[14,26,46],"relies":[15],"on":[16,110],"deep":[17],"learning":[18],"training":[19],"frameworks":[20],"utilizing":[21],"the":[22,40,50,62,81,91,95,130,159,172,177],"back-propagation":[23],"scheme.":[24,64],"However,":[25],"scheme":[27],"presents":[28],"inherent":[29],"issues,":[30],"e.g.":[31],"activation":[32],"memory":[33,136],"bottlenecks":[34],"and":[35,52,84,158,164],"backward":[36,96],"locking,":[37],"which":[38,118],"limit":[39],"efficient":[41],"computational":[42,86],"resource":[43],"usage.":[44],"work,":[47],"we":[48,139],"propose":[49],"design":[51],"analysis":[53],"of":[54,76,93],"ZeROf-Offload,":[55],"an":[56],"innovative":[57],"framework":[59,66],"that":[60,171],"adapts":[61],"forward-gradient":[63],"This":[65],"adopts":[67],"a":[68,111],"unique":[69],"forward-gradient-oriented":[70],"CPU":[71],"offload":[72],"strategy,":[73],"enabling":[74],"billion-scale":[77],"LLMs":[78],"solely":[79],"in":[80,98],"forward":[82],"phase":[83,97],"enhancing":[85],"efficiency.":[87],"Empirical":[88],"evaluations":[89],"reveal":[90],"advantage":[92],"eliminating":[94],"fine-tuning.":[99],"ZeROf-Offload":[100,142,174],"achieves134":[101],"TFlops/GPU":[102,121],"with":[105,124,146],"over":[106],"130":[107],"billion":[108,128],"parameters":[109],"single":[112],"DGX-A100":[113],"node,":[114],"outperforming":[115],"DeepSpeed\u2019s":[116],"ZeRO-Offload,":[117],"achieves":[119],"102":[120],"up":[125,154],"to":[126,155,182],"53.7":[127],"parameters,":[129],"largest":[131],"size":[132],"manageable":[133],"within":[134],"GPU":[135],"limitations.":[137],"Furthermore,":[138],"have":[140],"expanded":[141],"multi-DGX-A100":[144],"environments":[145],"integrated":[147],"3D":[148],"parallelism,":[149],"achieving":[150],"near-linear":[151],"speedup":[152],"across":[153],"128":[156],"GPUs":[157],"token":[160],"throughput":[161,179],"by":[162],"1.4x":[163],"1.5x,":[165],"respectively.":[166],"The":[167],"experimental":[168],"results":[169],"demonstrate":[170],"proposed":[173],"has":[175],"achieved":[176],"highest":[178],"performance":[180],"compared":[181],"all":[183],"examined":[184],"state-of-the-art":[185],"frameworks.":[186]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2025-10-10T00:00:00"}
