{"id":"https://openalex.org/W7131656391","doi":"https://doi.org/10.48550/arxiv.2602.21545","title":"MUON+: Towards More Effective Muon via One Additional Normalization Step for LLM Pre-training","display_name":"MUON+: Towards More Effective Muon via One Additional Normalization Step for LLM Pre-training","publication_year":2026,"publication_date":"2026-02-25","ids":{"openalex":"https://openalex.org/W7131656391","doi":"https://doi.org/10.48550/arxiv.2602.21545"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.21545","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126918790","display_name":"Ruijie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ruijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112593845","display_name":"Yequan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yequan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101933564","display_name":"Ziyue Liu","orcid":"https://orcid.org/0000-0001-9538-5350"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Zhengyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhengyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126935831","display_name":"Zheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Yupeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tan, Liyan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Liyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Zhang, Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.6406999826431274,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.6406999826431274,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.05849999934434891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10921","display_name":"Neutrino Physics Research","score":0.027699999511241913,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.8134999871253967},{"id":"https://openalex.org/keywords/ranging","display_name":"Ranging","score":0.6765999794006348},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.5659000277519226},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.4187999963760376},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.3869999945163727},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.33719998598098755}],"concepts":[{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.8134999871253967},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.6765999794006348},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.5659000277519226},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.544700026512146},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.45339998602867126},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.4187999963760376},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3869999945163727},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.335999995470047},{"id":"https://openalex.org/C205334942","wikidata":"https://www.wikidata.org/wiki/Q3151","display_name":"Muon","level":2,"score":0.3310000002384186},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.32989999651908875},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.323199987411499},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.32120001316070557},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2791999876499176},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.2578999996185303}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.21545","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.21545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.21545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.21545","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Muon":[0,143],"has":[1],"recently":[2],"emerged":[3],"as":[4],"a":[5,82,98,103],"strong":[6],"optimizer":[7,113],"for":[8],"large":[9],"language":[10],"model":[11],"pre-training,":[12],"orthogonalizing":[13],"the":[14,30,45,62,67,78],"momentum":[15],"matrix":[16],"via":[17],"Newton--Schulz":[18],"polar":[19,26,57,108],"iterations.":[20],"A":[21],"natural":[22],"intuition":[23],"is":[24,51],"that":[25,49,74,101],"iterations,":[27],"by":[28,92],"flattening":[29],"singular":[31],"spectrum":[32],"to":[33,125,137,152],"all":[34],"ones,":[35],"should":[36],"also":[37],"eliminate":[38],"column-":[39],"and":[40,72,120,132,148],"row-wise":[41],"norm":[42],"imbalance":[43,76],"in":[44,54,81,144],"update.":[46],"We":[47,64],"show":[48],"this":[50,66,93],"not":[52],"true":[53],"practice:":[55],"practical":[56],"steps":[58],"can":[59],"substantially":[60],"amplify":[61],"imbalance.":[63],"term":[65,80],"post-polar":[68],"imbalanced":[69],"update":[70],"problem,":[71],"prove":[73],"such":[75],"tightens":[77],"second-order":[79],"blockwise":[83],"descent":[84,89],"analysis,":[85,94],"weakening":[86],"Muon's":[87],"per-step":[88],"guarantee.":[90],"Motivated":[91],"we":[95],"propose":[96],"Muon+,":[97],"one-line":[99],"fix":[100],"inserts":[102],"single":[104],"normalization":[105],"step":[106],"after":[107],"orthogonalization.":[109],"Muon+":[110,140],"adds":[111],"no":[112],"state.":[114],"Across":[115],"pre-training":[116,155],"experiments":[117],"on":[118],"GPT":[119],"LLaMA":[121],"models":[122],"from":[123],"60M":[124],"7B":[126],"parameters,":[127],"spanning":[128],"both":[129],"compute-optimal":[130],"budgets":[131],"extended":[133],"token-to-parameter":[134],"ratios":[135],"up":[136],"approximately":[138],"200,":[139],"consistently":[141],"outperforms":[142],"terms":[145],"of":[146],"training":[147],"validation":[149],"perplexity,":[150],"leading":[151],"significant":[153],"overall":[154],"speedup.":[156]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-27T00:00:00"}
