{"id":"https://openalex.org/W7154373369","doi":"https://doi.org/10.48550/arxiv.2604.09967","title":"Muon$^2$: Boosting Muon via Adaptive Second-Moment Preconditioning","display_name":"Muon$^2$: Boosting Muon via Adaptive Second-Moment Preconditioning","publication_year":2026,"publication_date":"2026-04-11","ids":{"openalex":"https://openalex.org/W7154373369","doi":"https://doi.org/10.48550/arxiv.2604.09967"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.09967","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09967","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.09967","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101933564","display_name":"Ziyue Liu","orcid":"https://orcid.org/0000-0001-9538-5350"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Ziyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133610034","display_name":"Ruijie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ruijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133568408","display_name":"Zhengyang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhengyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112593845","display_name":"Yequan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yequan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124857835","display_name":"Yupeng Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Yupeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133591798","display_name":"Zi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133619061","display_name":"Zheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101933564"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.3043000102043152,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.3043000102043152,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10048","display_name":"Particle physics theoretical and experimental studies","score":0.14180000126361847,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.08410000056028366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/muon","display_name":"Muon","score":0.7656000256538391},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5928000211715698},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.5863999724388123},{"id":"https://openalex.org/keywords/orthogonalization","display_name":"Orthogonalization","score":0.5116000175476074},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.4397999942302704},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.38420000672340393},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.34630000591278076},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.3353999853134155},{"id":"https://openalex.org/keywords/momentum","display_name":"Momentum (technical analysis)","score":0.32580000162124634}],"concepts":[{"id":"https://openalex.org/C205334942","wikidata":"https://www.wikidata.org/wiki/Q3151","display_name":"Muon","level":2,"score":0.7656000256538391},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5928000211715698},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.5863999724388123},{"id":"https://openalex.org/C47559304","wikidata":"https://www.wikidata.org/wiki/Q1702189","display_name":"Orthogonalization","level":2,"score":0.5116000175476074},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4708000123500824},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.4397999942302704},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.41760000586509705},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.38420000672340393},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.34630000591278076},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.34130001068115234},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3353999853134155},{"id":"https://openalex.org/C60718061","wikidata":"https://www.wikidata.org/wiki/Q1414747","display_name":"Momentum (technical analysis)","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C109214941","wikidata":"https://www.wikidata.org/wiki/Q18334","display_name":"Particle physics","level":1,"score":0.32580000162124634},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3181000053882599},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3075000047683716},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.3043999969959259},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C184720557","wikidata":"https://www.wikidata.org/wiki/Q7825049","display_name":"Topology (electrical circuits)","level":2,"score":0.29089999198913574},{"id":"https://openalex.org/C2779072382","wikidata":"https://www.wikidata.org/wiki/Q2937690","display_name":"Muon capture","level":3,"score":0.2874999940395355},{"id":"https://openalex.org/C2781204021","wikidata":"https://www.wikidata.org/wiki/Q6497091","display_name":"Lattice (music)","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C156778621","wikidata":"https://www.wikidata.org/wiki/Q1365748","display_name":"Spectrum (functional analysis)","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.265500009059906},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.09967","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09967","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.09967","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09967","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Muon":[0,54,75,117,136,139],"has":[1],"emerged":[2],"as":[3],"a":[4,96,151],"promising":[5],"optimizer":[6],"for":[7,33],"large-scale":[8],"foundation":[9],"model":[10],"pre-training":[11,126],"by":[12,30,89,145],"exploiting":[13],"the":[14,31,68,78,84,103,159],"matrix":[15],"structure":[16],"of":[17,53,71,82,158,161],"neural":[18],"network":[19],"updates":[20],"through":[21],"iterative":[22],"orthogonalization.":[23,62,99],"However,":[24],"its":[25],"practical":[26,104],"efficiency":[27],"is":[28,66,86],"limited":[29],"need":[32],"multiple":[34],"Newton--Schulz":[35],"(NS)":[36],"iterations":[37,144],"per":[38],"optimization":[39],"step,":[40],"which":[41,83,111],"introduces":[42],"non-trivial":[43],"computation":[44],"and":[45,124,137],"communication":[46],"overhead.":[47,166],"We":[48,100,147],"propose":[49],"Muon$^2$,":[50,90],"an":[51],"extension":[52],"that":[55,67,155],"applies":[56],"Adam-style":[57],"adaptive":[58],"second-moment":[59],"preconditioning":[60],"before":[61],"Our":[63],"key":[64],"insight":[65],"core":[69],"challenge":[70],"polar":[72,120],"approximation":[73],"in":[74,77],"lies":[76],"ill-conditioned":[79],"momentum":[80],"matrix,":[81],"spectrum":[85],"substantially":[87],"improved":[88],"leading":[91],"to":[92,130],"faster":[93],"convergence":[94],"toward":[95],"practically":[97],"sufficient":[98],"further":[101,148],"characterize":[102],"orthogonalization":[105],"quality":[106],"via":[107],"directional":[108],"alignment,":[109],"under":[110],"Muon$^2$":[112,133,162],"demonstrates":[113],"dramatic":[114],"improvement":[115],"over":[116],"at":[118],"each":[119],"step.":[121],"Across":[122],"GPT":[123],"LLaMA":[125],"experiments":[127],"from":[128],"60M":[129],"1.3B":[131],"parameters,":[132],"consistently":[134],"outperforms":[135],"recent":[138],"variants":[140],"while":[141],"reducing":[142],"NS":[143],"40\\%.":[146],"introduce":[149],"Muon$^2$-F,":[150],"memory-efficient":[152],"factorized":[153],"variant":[154],"preserves":[156],"most":[157],"gains":[160],"with":[163],"negligible":[164],"memory":[165]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-15T00:00:00"}
