{"id":"https://openalex.org/W7130733643","doi":"https://doi.org/10.48550/arxiv.2602.17080","title":"Adam Improves Muon: Adaptive Moment Estimation with Orthogonalized Momentum","display_name":"Adam Improves Muon: Adaptive Moment Estimation with Orthogonalized Momentum","publication_year":2026,"publication_date":"2026-02-19","ids":{"openalex":"https://openalex.org/W7130733643","doi":"https://doi.org/10.48550/arxiv.2602.17080"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.17080","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126514841","display_name":"Minxin Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Minxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053232704","display_name":"Yuxuan Liu","orcid":"https://orcid.org/0009-0002-9684-6416"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Schaeffer, Hayden","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schaeffer, Hayden","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.13779999315738678,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.13779999315738678,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.1216999962925911,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10048","display_name":"Particle physics theoretical and experimental studies","score":0.08389999717473984,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hessian-matrix","display_name":"Hessian matrix","score":0.683899998664856},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5738000273704529},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5314000248908997},{"id":"https://openalex.org/keywords/momentum","display_name":"Momentum (technical analysis)","score":0.5116999745368958},{"id":"https://openalex.org/keywords/moment","display_name":"Moment (physics)","score":0.4602999985218048},{"id":"https://openalex.org/keywords/orthogonality","display_name":"Orthogonality","score":0.45820000767707825},{"id":"https://openalex.org/keywords/diagonal","display_name":"Diagonal","score":0.41760000586509705},{"id":"https://openalex.org/keywords/rate-of-convergence","display_name":"Rate of convergence","score":0.32679998874664307}],"concepts":[{"id":"https://openalex.org/C203616005","wikidata":"https://www.wikidata.org/wiki/Q620495","display_name":"Hessian matrix","level":2,"score":0.683899998664856},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5738000273704529},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5314000248908997},{"id":"https://openalex.org/C60718061","wikidata":"https://www.wikidata.org/wiki/Q1414747","display_name":"Momentum (technical analysis)","level":2,"score":0.5116999745368958},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.4763000011444092},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.46070000529289246},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.4602999985218048},{"id":"https://openalex.org/C17137986","wikidata":"https://www.wikidata.org/wiki/Q215067","display_name":"Orthogonality","level":2,"score":0.45820000767707825},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.45179998874664307},{"id":"https://openalex.org/C130367717","wikidata":"https://www.wikidata.org/wiki/Q189791","display_name":"Diagonal","level":2,"score":0.41760000586509705},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.37779998779296875},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.32679998874664307},{"id":"https://openalex.org/C121864883","wikidata":"https://www.wikidata.org/wiki/Q677916","display_name":"Statistical physics","level":1,"score":0.3255999982357025},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C4199805","wikidata":"https://www.wikidata.org/wiki/Q2725903","display_name":"Gaussian noise","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2969000041484833},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C8272713","wikidata":"https://www.wikidata.org/wiki/Q176737","display_name":"Stochastic process","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C207658827","wikidata":"https://www.wikidata.org/wiki/Q1999781","display_name":"Stochastic resonance","level":4,"score":0.2752000093460083},{"id":"https://openalex.org/C64341305","wikidata":"https://www.wikidata.org/wiki/Q4919225","display_name":"Bivariate analysis","level":2,"score":0.26739999651908875},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.2615000009536743},{"id":"https://openalex.org/C55479107","wikidata":"https://www.wikidata.org/wiki/Q97663916","display_name":"Stochastic approximation","level":3,"score":0.2581999897956848},{"id":"https://openalex.org/C34862557","wikidata":"https://www.wikidata.org/wiki/Q178985","display_name":"Ode","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C8642999","wikidata":"https://www.wikidata.org/wiki/Q4171168","display_name":"Hyperparameter","level":2,"score":0.251800000667572},{"id":"https://openalex.org/C35651441","wikidata":"https://www.wikidata.org/wiki/Q625303","display_name":"Independence (probability theory)","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.17080","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.17080","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17080","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.17080","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Efficient":[0],"stochastic":[1,20,140,151],"optimization":[2],"typically":[3],"integrates":[4],"an":[5,181],"update":[6,194],"direction":[7,195],"that":[8,185],"performs":[9],"well":[10],"in":[11,44,131,138],"the":[12,33,62,114,132,139,147,168,187],"deterministic":[13,133],"regime":[14],"with":[15,69,102,113,173],"a":[16,51,55,79,99,192],"mechanism":[17],"adapting":[18],"to":[19,28,146,167],"perturbations.":[21],"While":[22],"Adam":[23],"uses":[24],"adaptive":[25,81],"moment":[26],"estimates":[27],"promote":[29],"stability,":[30],"Muon":[31,88,171],"utilizes":[32],"weight":[34],"layers'":[35],"matrix":[36,101],"structure":[37],"via":[38,180],"orthogonalized":[39,67,76,96],"momentum,":[40],"showing":[41],"superior":[42],"performance":[43,160],"large":[45],"language":[46],"model":[47],"training.":[48],"We":[49],"propose":[50],"new":[52],"optimizer":[53],"and":[54,59,111,135,164,170,196],"diagonal":[56,100],"extension,":[57],"NAMO":[58,74,163,179],"NAMO-D,":[60],"providing":[61],"first":[63],"principled":[64],"integration":[65],"of":[66,150,161,190],"momentum":[68,77,97],"norm-based":[70],"Adam-type":[71],"noise":[72,109,148,199],"adaptation.":[73,200],"scales":[75],"using":[78],"single":[80],"stepsize,":[82],"preserving":[83],"orthogonality":[84],"while":[85],"improving":[86],"upon":[87],"at":[89],"negligible":[90],"additional":[91,182],"cost.":[92],"NAMO-D":[93,165,174],"instead":[94],"right-multiplies":[95],"by":[98],"clamped":[103],"entries.":[104],"This":[105],"design":[106],"enables":[107],"neuron-wise":[108],"adaptation":[110],"aligns":[112],"common":[115],"near":[116],"block-diagonal":[117],"Hessian":[118],"structure.":[119],"Under":[120],"standard":[121],"assumptions,":[122],"we":[123],"establish":[124],"optimal":[125],"convergence":[126,143],"rates":[127],"for":[128],"both":[129,162],"algorithms":[130],"setting":[134],"show":[136],"that,":[137],"setting,":[141],"their":[142],"guarantees":[144],"adapt":[145],"level":[149],"gradients.":[152],"Experiments":[153],"on":[154],"pretraining":[155],"GPT-2":[156],"models":[157],"demonstrate":[158],"improved":[159],"compared":[166],"AdamW":[169],"baselines,":[172],"achieving":[175],"further":[176],"gains":[177],"over":[178],"clamping":[183],"hyperparameter":[184],"balances":[186],"competing":[188],"goals":[189],"maintaining":[191],"well-conditioned":[193],"leveraging":[197],"fine-grained":[198]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-21T00:00:00"}
