{"id":"https://openalex.org/W7147057124","doi":"https://doi.org/10.48550/arxiv.2603.28254","title":"MuonEq: Balancing Before Orthogonalization with Lightweight Equilibration","display_name":"MuonEq: Balancing Before Orthogonalization with Lightweight Equilibration","publication_year":2026,"publication_date":"2026-03-30","ids":{"openalex":"https://openalex.org/W7147057124","doi":"https://doi.org/10.48550/arxiv.2603.28254"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.28254","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28254","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.28254","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112387709","display_name":"Da Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chang, Da","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132547967","display_name":"Qiankun Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Qiankun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132563718","display_name":"Lvgang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Lvgang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132690587","display_name":"Yu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132671349","display_name":"Ruijie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ruijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132570164","display_name":"Yao Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132717575","display_name":"Yongxiang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yongxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5040011580","display_name":"Ganzhao Yuan","orcid":"https://orcid.org/0000-0002-2239-7315"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Ganzhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5112387709"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.43860000371932983,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.43860000371932983,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10048","display_name":"Particle physics theoretical and experimental studies","score":0.19769999384880066,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10921","display_name":"Neutrino Physics Research","score":0.12200000137090683,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/orthogonalization","display_name":"Orthogonalization","score":0.9797000288963318},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.9124000072479248},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.3864000141620636},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.3862000107765198},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.353300005197525}],"concepts":[{"id":"https://openalex.org/C47559304","wikidata":"https://www.wikidata.org/wiki/Q1702189","display_name":"Orthogonalization","level":2,"score":0.9797000288963318},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.9124000072479248},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.5},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4952999949455261},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.43140000104904175},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.3864000141620636},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3862000107765198},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.353300005197525},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30410000681877136},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.30219998955726624},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.3003000020980835},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C27438332","wikidata":"https://www.wikidata.org/wiki/Q2873","display_name":"Principal component analysis","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.26460000872612}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.28254","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28254","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.28254","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28254","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Orthogonalized-update":[0],"optimizers":[1],"such":[2],"as":[3],"Muon":[4,39,139],"improve":[5],"training":[6],"of":[7,34,125],"matrix-valued":[8],"parameters,":[9],"but":[10],"existing":[11],"extensions":[12],"mostly":[13],"act":[14],"either":[15],"after":[16],"orthogonalization":[17,76],"by":[18,79,109],"rescaling":[19],"updates":[20],"or":[21],"before":[22,60],"it":[23],"with":[24],"heavier":[25],"whitening-based":[26],"preconditioners.":[27],"We":[28,72],"introduce":[29],"{\\method},":[30,110],"a":[31,94],"lightweight":[32],"family":[33],"pre-orthogonalization":[35],"equilibration":[36],"schemes":[37],"for":[38],"in":[40],"three":[41],"forms:":[42],"two-sided":[43],"row/column":[44,64,91],"normalization":[45,48,52,92],"(RC),":[46],"row":[47],"(R),":[49],"and":[50,67,86,89,119,142,148],"column":[51],"(C).":[53],"These":[54],"variants":[55],"rebalance":[56],"the":[57,104,111,116,121,133],"momentum":[58],"matrix":[59,106],"finite-step":[61,75],"Newton--Schulz":[62],"using":[63],"squared-norm":[65],"statistics":[66],"only":[68],"$\\mathcal{O}(m+n)$":[69],"auxiliary":[70],"state.":[71],"show":[73],"that":[74,90,98],"is":[77,93,115],"governed":[78],"input":[80],"spectral":[81],"properties,":[82],"especially":[83],"stable":[84],"rank":[85],"condition":[87],"number,":[88],"zeroth-order":[95],"whitening":[96],"surrogate":[97],"removes":[99],"marginal":[100],"scale":[101],"mismatch.":[102],"For":[103],"hidden":[105],"weights":[107],"targeted":[108],"row-normalized":[112],"variant":[113,136],"R":[114,135],"natural":[117],"default":[118,134],"preserves":[120],"$\\widetilde{\\mathcal{O}}(T^{-1/4})$":[122],"stationarity":[123],"guarantee":[124],"Muon-type":[126],"methods.":[127],"In":[128],"LLaMA2":[129],"pretraining":[130],"on":[131,140],"C4,":[132],"consistently":[137],"outperforms":[138],"130M":[141],"350M":[143],"models,":[144],"yielding":[145],"faster":[146],"convergence":[147],"lower":[149],"validation":[150],"perplexity.":[151]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-02T00:00:00"}
