{"id":"https://openalex.org/W7160599108","doi":"https://doi.org/10.48550/arxiv.2605.06654","title":"Optimizer-Model Consistency: Full Finetuning with the Same Optimizer as Pretraining Forgets Less","display_name":"Optimizer-Model Consistency: Full Finetuning with the Same Optimizer as Pretraining Forgets Less","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160599108","doi":"https://doi.org/10.48550/arxiv.2605.06654"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.06654","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06654","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.06654","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135641239","display_name":"Yuxing Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135699003","display_name":"Jianyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jianyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135650848","display_name":"Tong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.2206999957561493,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.2206999957561493,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08479999750852585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.07559999823570251,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.6948000192642212},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.6870999932289124},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5479999780654907},{"id":"https://openalex.org/keywords/minification","display_name":"Minification","score":0.32690000534057617},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.32440000772476196}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8127999901771545},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.6948000192642212},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.6870999932289124},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5688999891281128},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5479999780654907},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.47360000014305115},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C154556556","wikidata":"https://www.wikidata.org/wiki/Q192969","display_name":"Computer multitasking","level":2,"score":0.3068999946117401},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.06654","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06654","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.06654","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06654","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.4262881875038147,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Optimizers":[0],"play":[1],"an":[2,22],"important":[3],"role":[4],"in":[5,32,107,116,130],"both":[6],"pretraining":[7,33,154],"and":[8,79,146,155,158],"finetuning":[9,26,63],"stages":[10,157],"when":[11,148,164],"training":[12],"large":[13],"language":[14,172],"models":[15,90],"(LLMs).":[16],"In":[17],"this":[18,68,110,178],"paper,":[19],"we":[20,82,142,175],"present":[21],"observation":[23],"that":[24,160,177],"full":[25],"with":[27,193],"the":[28,44,50,61,89,96,103,113,127,138,153],"same":[29,45,139],"optimizer":[30],"as":[31,199],"achieves":[34],"a":[35,170,194],"better":[36,47,73],"learning-forgetting":[37],"tradeoff,":[38],"i.e.,":[39],"forgetting":[40,125],"less":[41],"while":[42],"achieving":[43],"or":[46],"performance":[48],"on":[49,95],"new":[51],"task,":[52],"than":[53],"other":[54],"optimizers":[55,86],"and,":[56],"possibly":[57],"surprisingly,":[58],"LoRA,":[59],"during":[60],"supervised":[62],"(SFT)":[64],"stage.":[65],"We":[66],"term":[67],"phenomenon":[69],"optimizer-model":[70],"consistency.":[71],"To":[72],"understand":[74],"it,":[75],"through":[76],"controlled":[77],"experiments":[78],"theoretical":[80],"analysis,":[81],"show":[83],"that:":[84],"1)":[85],"can":[87,133,179],"shape":[88],"by":[91,136],"having":[92],"regularization":[93,111],"effects":[94],"activations,":[97],"leading":[98],"to":[99,109,123],"different":[100],"landscapes":[101],"around":[102],"pretrained":[104],"checkpoints;":[105],"2)":[106],"response":[108],"effect,":[112],"weight":[114],"update":[115],"SFT":[117,156],"should":[118],"follow":[119],"some":[120],"specific":[121],"structures":[122],"lower":[124],"of":[126,197],"knowledge":[128],"learned":[129],"pretraining,":[131],"which":[132,188],"be":[134],"obtained":[135],"using":[137],"optimizer.":[140],"Moreover,":[141],"specifically":[143],"compare":[144],"Muon":[145,161],"AdamW":[147],"they":[149],"are":[150],"employed":[151],"throughout":[152],"find":[159],"performs":[162],"worse":[163],"finetuned":[165],"for":[166,200],"reasoning":[167],"tasks.":[168],"With":[169],"synthetic":[171],"modeling":[173],"experiment,":[174],"demonstrate":[176],"come":[180],"from":[181],"Muon's":[182],"strong":[183],"tendency":[184],"towards":[185],"rote":[186],"memorization,":[187],"may":[189],"hurt":[190],"pattern":[191],"acquisition":[192],"small":[195],"amount":[196],"data,":[198],"SFT.":[201]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-09T00:00:00"}
