{"id":"https://openalex.org/W7138414785","doi":"https://doi.org/10.1609/aaai.v40i28.39549","title":"FedAdamW: A Communication-Efficient Optimizer with Convergence and Generalization Guarantees for Federated Large Models","display_name":"FedAdamW: A Communication-Efficient Optimizer with Convergence and Generalization Guarantees for Federated Large Models","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138414785","doi":"https://doi.org/10.1609/aaai.v40i28.39549"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i28.39549","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i28.39549","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i28.39549","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Junkang Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]},{"id":"https://openalex.org/I4210115513","display_name":"Xi\u2019an University","ror":"https://ror.org/01zzmf129","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115513"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junkang Liu","raw_affiliation_strings":["Tianjin University","Xi'an University of Electronic Science and Technology"],"affiliations":[{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Xi'an University of Electronic Science and Technology","institution_ids":["https://openalex.org/I4210115513"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Fanhua Shang","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fanhua Shang","raw_affiliation_strings":["Tianjin University"],"affiliations":[{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hongying Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongying Liu","raw_affiliation_strings":["Tianjin University"],"affiliations":[{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuxuan Tian","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxuan Tian","raw_affiliation_strings":["Institute of automation, Chinese academy of science, Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of automation, Chinese academy of science, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuanyuan Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanyuan Liu","raw_affiliation_strings":["Xidian University"],"affiliations":[{"raw_affiliation_string":"Xidian University","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jin Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]},{"id":"https://openalex.org/I4210115513","display_name":"Xi\u2019an University","ror":"https://ror.org/01zzmf129","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115513"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jin Liu","raw_affiliation_strings":["Tianjin University","Xi'an University of Electronic Science and Technology"],"affiliations":[{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Xi'an University of Electronic Science and Technology","institution_ids":["https://openalex.org/I4210115513"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Kewen Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kewen Zhu","raw_affiliation_strings":["Tianjin University"],"affiliations":[{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":null,"display_name":"Zhouchen Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhouchen Lin","raw_affiliation_strings":["Peking University,\nPazhou Laboratory (Huangpu)"],"affiliations":[{"raw_affiliation_string":"Peking University,\nPazhou Laboratory (Huangpu)","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I162868743","https://openalex.org/I4210115513"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.63436831,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"28","first_page":"23748","last_page":"23756"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.36579999327659607,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.36579999327659607,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.15539999306201935,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.10930000245571136,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.7551000118255615},{"id":"https://openalex.org/keywords/federated-learning","display_name":"Federated learning","score":0.5964000225067139},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5676000118255615},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.5408999919891357},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5346999764442444},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5005000233650208},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4733000099658966},{"id":"https://openalex.org/keywords/rate-of-convergence","display_name":"Rate of convergence","score":0.4724000096321106},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4494999945163727}],"concepts":[{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.7551000118255615},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7429999709129333},{"id":"https://openalex.org/C2992525071","wikidata":"https://www.wikidata.org/wiki/Q50818671","display_name":"Federated learning","level":2,"score":0.5964000225067139},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5676000118255615},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.5408999919891357},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5346999764442444},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5005000233650208},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4733000099658966},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.4724000096321106},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4494999945163727},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.3959999978542328},{"id":"https://openalex.org/C2779251273","wikidata":"https://www.wikidata.org/wiki/Q43436","display_name":"Pearl","level":2,"score":0.3873000144958496},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36480000615119934},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.35569998621940613},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.3531999886035919},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34709998965263367},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.33809998631477356},{"id":"https://openalex.org/C135320971","wikidata":"https://www.wikidata.org/wiki/Q1868524","display_name":"Local search (optimization)","level":2,"score":0.3059999942779541},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.29589998722076416},{"id":"https://openalex.org/C5465570","wikidata":"https://www.wikidata.org/wiki/Q5326898","display_name":"Early stopping","level":3,"score":0.2906000018119812},{"id":"https://openalex.org/C149629883","wikidata":"https://www.wikidata.org/wiki/Q660926","display_name":"Fraction (chemistry)","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C117765406","wikidata":"https://www.wikidata.org/wiki/Q5362437","display_name":"Generalization error","level":3,"score":0.2782000005245209},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2750999927520752},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26579999923706055},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2567000091075897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i28.39549","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i28.39549","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i28.39549","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i28.39549","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"AdamW":[0,29,42,57,84],"has":[1],"become":[2],"one":[3],"of":[4,22,56,122,144,157,166,175,188,200],"the":[5,20,48,53,81,100,120,123,155,164,172,186,198],"most":[6],"effective":[7],"optimizers":[8],"for":[9,88],"training":[10,89],"large-scale":[11],"models.":[12,94,207],"We":[13,178],"have":[14],"also":[15,179],"observed":[16],"its":[17],"effectiveness":[18,187,199],"in":[19,30,47,192],"context":[21],"federated":[23,31],"learning":[24,32],"(FL).":[25],"However,":[26],"directly":[27],"applying":[28],"settings":[33],"poses":[34],"significant":[35],"challenges:":[36],"(1)":[37],"due":[38],"to":[39,113,126,184,209],"data":[40],"heterogeneity,":[41],"often":[43],"yields":[44],"high":[45],"variance":[46,129],"second-moment":[49,124],"estimate":[50],"v;":[51],"(2)":[52],"local":[54,97,106,115,167,193],"overfitting":[55],"may":[58],"cause":[59],"client":[60],"drift;":[61],"and":[62,90,109,130,169,204,217],"(3)":[63],"Reinitializing":[64],"moment":[65],"estimates":[66,125],"(v,":[67],"m)":[68],"at":[69],"each":[70],"round":[71],"slows":[72],"down":[73],"convergence.":[74],"To":[75],"address":[76],"these":[77],"challenges,":[78],"we":[79,134,196],"propose":[80],"first":[82],"Federated":[83],"algorithm,":[85],"called":[86],"FedAdamW,":[87],"fine-tuning":[91],"various":[92],"large":[93],"FedAdamW":[95,117,137,201,212],"aligns":[96],"updates":[98],"with":[99],"global":[101],"update":[102],"using":[103],"both":[104],"a":[105,139],"correction":[107],"mechanism":[108],"decoupled":[110,189],"weight":[111,190],"decay":[112,191],"mitigate":[114],"overfitting.":[116],"efficiently":[118],"aggregates":[119],"mean":[121],"reduce":[127],"their":[128],"reinitialize":[131],"them.":[132],"Theoretically,":[133],"prove":[135],"that":[136],"achieves":[138],"linear":[140],"speedup":[141],"convergence":[142],"rate":[143],"O\uff08p\uff08L\u2206\u03c32l":[145],"\uff09/\uff08SKR\u03b52\uff09":[146],"+":[147],"\uff08L\u2206\uff09/R\uff09":[148],"without":[149],"heterogeneity":[150],"assumption,":[151],"where":[152],"S":[153],"is":[154,163,171],"number":[156,165,174],"participating":[158],"clients":[159],"per":[160],"round,":[161],"K":[162],"iterations,":[168],"R":[170],"total":[173],"communication":[176,215],"rounds.":[177],"employ":[180],"PAC-Bayesian":[181],"generalization":[182],"analysis":[183],"explain":[185],"training.":[194],"Empirically,":[195],"validate":[197],"on":[202],"language":[203],"vision":[205],"Transformer":[206],"Compared":[208],"several":[210],"baselines,":[211],"significantly":[213],"reduces":[214],"rounds":[216],"improves":[218],"test":[219],"accuracy.":[220]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-02-02T00:00:00"}
