{"id":"https://openalex.org/W4415250134","doi":"https://doi.org/10.48550/arxiv.2509.15816","title":"On the Convergence of Muon and Beyond","display_name":"On the Convergence of Muon and Beyond","publication_year":2025,"publication_date":"2025-09-19","ids":{"openalex":"https://openalex.org/W4415250134","doi":"https://doi.org/10.48550/arxiv.2509.15816"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.15816","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.15816","pdf_url":"https://arxiv.org/pdf/2509.15816","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.15816","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112387709","display_name":"Da Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chang, Da","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054002392","display_name":"Yongxiang Liu","orcid":"https://orcid.org/0000-0002-0682-8365"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yongxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5040011580","display_name":"Ganzhao Yuan","orcid":"https://orcid.org/0000-0002-2239-7315"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Ganzhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5112387709"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10048","display_name":"Particle physics theoretical and experimental studies","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10048","display_name":"Particle physics theoretical and experimental studies","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.932200014591217,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.9121999740600586,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.718500018119812},{"id":"https://openalex.org/keywords/muon","display_name":"Muon","score":0.7032999992370605},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5471000075340271},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.5393999814987183},{"id":"https://openalex.org/keywords/variance-reduction","display_name":"Variance reduction","score":0.53329998254776},{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.5004000067710876},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.4797999858856201},{"id":"https://openalex.org/keywords/upper-and-lower-bounds","display_name":"Upper and lower bounds","score":0.4066999852657318}],"concepts":[{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.718500018119812},{"id":"https://openalex.org/C205334942","wikidata":"https://www.wikidata.org/wiki/Q3151","display_name":"Muon","level":2,"score":0.7032999992370605},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5471000075340271},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.5393999814987183},{"id":"https://openalex.org/C62644790","wikidata":"https://www.wikidata.org/wiki/Q3454689","display_name":"Variance reduction","level":3,"score":0.53329998254776},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.5004000067710876},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.4797999858856201},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.47290000319480896},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.4108999967575073},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.4066999852657318},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.37720000743865967},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.35409998893737793},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.3463999927043915},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3312000036239624},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.32910001277923584},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.3181999921798706},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.3140999972820282},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C121864883","wikidata":"https://www.wikidata.org/wiki/Q677916","display_name":"Statistical physics","level":1,"score":0.2750000059604645},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C88757350","wikidata":"https://www.wikidata.org/wiki/Q1557613","display_name":"L\u00e9vy process","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C8272713","wikidata":"https://www.wikidata.org/wiki/Q176737","display_name":"Stochastic process","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.2549000084400177},{"id":"https://openalex.org/C206654554","wikidata":"https://www.wikidata.org/wiki/Q5374247","display_name":"Empirical measure","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.15816","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.15816","pdf_url":"https://arxiv.org/pdf/2509.15816","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2509.15816","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.15816","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.15816","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.15816","pdf_url":"https://arxiv.org/pdf/2509.15816","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4415250134.pdf","grobid_xml":"https://content.openalex.org/works/W4415250134.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"Muon":[1,33,141],"optimizer":[2],"has":[3],"demonstrated":[4],"remarkable":[5],"empirical":[6],"success":[7],"in":[8,43],"handling":[9],"matrix-structured":[10],"parameters":[11],"for":[12,101,116],"training":[13],"neural":[14],"networks.":[15],"However,":[16],"a":[17,37],"significant":[18],"gap":[19],"remains":[20],"between":[21],"its":[22],"practical":[23,135],"performance":[24],"and":[25,70,131],"theoretical":[26,57],"understanding.":[27],"Existing":[28],"analyses":[29],"show":[30],"that":[31],"the":[32,50,56,67,71,76,91,98,106,117,134,138],"variants":[34],"achieve":[35],"only":[36],"suboptimal":[38],"iteration":[39],"complexity":[40],"of":[41,52,59,137],"$\\mathcal{O}(T^{-1/4})$":[42],"stochastic":[44],"non-convex":[45],"settings,":[46],"where":[47],"$T$":[48],"denotes":[49],"number":[51],"iterations.":[53],"To":[54],"study":[55],"limits":[58],"Muon,":[60],"we":[61,110],"analyze":[62],"two":[63],"momentum-based":[64],"variance-reduced":[65,140],"variants:":[66],"one-batch":[68],"Muon-MVR1":[69,121],"two-batch":[72],"Muon-MVR2.":[73],"We":[74],"provide":[75],"first":[77],"rigorous":[78],"proof":[79],"that,":[80],"under":[81],"horizon-free":[82],"learning-rate":[83],"schedules,":[84],"variance":[85],"reduction":[86],"enables":[87],"Muon-MVR2":[88,125],"to":[89],"attain":[90],"optimal":[92],"anytime":[93,113],"convergence":[94],"rate":[95],"$\\tilde{\\mathcal{O}}(T^{-1/3})$,":[96],"matching":[97],"lower":[99],"bound":[100],"this":[102],"problem":[103],"class.":[104],"Under":[105],"Polyak--\u0141ojasiewicz":[107],"(PL)":[108],"condition,":[109],"further":[111],"establish":[112],"best-iterate":[114],"guarantees":[115],"expected":[118],"square-root":[119],"suboptimality:":[120],"achieves":[122,126],"$\\widetilde{\\mathcal{O}}(T^{-1/4})$,":[123],"while":[124],"$\\widetilde{\\mathcal{O}}(T^{-1/3})$.":[127],"Experiments":[128],"on":[129],"CIFAR-10":[130],"C4":[132],"support":[133],"effectiveness":[136],"proposed":[139],"variants.":[142]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-16T00:00:00"}
