{"id":"https://openalex.org/W7127945923","doi":"https://doi.org/10.48550/arxiv.2602.05725","title":"Muon in Associative Memory Learning: Training Dynamics and Scaling Laws","display_name":"Muon in Associative Memory Learning: Training Dynamics and Scaling Laws","publication_year":2026,"publication_date":"2026-02-05","ids":{"openalex":"https://openalex.org/W7127945923","doi":"https://doi.org/10.48550/arxiv.2602.05725"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.05725","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125133993","display_name":"Binghui Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Binghui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125220028","display_name":"Kaifei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kaifei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125249086","display_name":"Han Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125153566","display_name":"Pinyan Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Pinyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125130923","display_name":"Liwei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Liwei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5125133993"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.10419999808073044,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.10419999808073044,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.07680000364780426,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.0706000030040741,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/muon","display_name":"Muon","score":0.6427000164985657},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6266999840736389},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.6189000010490417},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5188000202178955},{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.516700029373169},{"id":"https://openalex.org/keywords/content-addressable-memory","display_name":"Content-addressable memory","score":0.5077000260353088},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.46399998664855957},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.4620000123977661}],"concepts":[{"id":"https://openalex.org/C205334942","wikidata":"https://www.wikidata.org/wiki/Q3151","display_name":"Muon","level":2,"score":0.6427000164985657},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6266999840736389},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.6189000010490417},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5188000202178955},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.516700029373169},{"id":"https://openalex.org/C53442348","wikidata":"https://www.wikidata.org/wiki/Q745101","display_name":"Content-addressable memory","level":3,"score":0.5077000260353088},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.46399998664855957},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.4620000123977661},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.45489999651908875},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.435699999332428},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.4104999899864197},{"id":"https://openalex.org/C139676723","wikidata":"https://www.wikidata.org/wiki/Q1193832","display_name":"Sign (mathematics)","level":2,"score":0.36079999804496765},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.3546000123023987},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.35350000858306885},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C151376022","wikidata":"https://www.wikidata.org/wiki/Q168698","display_name":"Exponential function","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C121864883","wikidata":"https://www.wikidata.org/wiki/Q677916","display_name":"Statistical physics","level":1,"score":0.29249998927116394},{"id":"https://openalex.org/C170122806","wikidata":"https://www.wikidata.org/wiki/Q1914828","display_name":"Linear scale","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2782999873161316},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.2759000062942505},{"id":"https://openalex.org/C195906000","wikidata":"https://www.wikidata.org/wiki/Q1191722","display_name":"Matrix exponential","level":3,"score":0.274399995803833},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.25290000438690186},{"id":"https://openalex.org/C159423971","wikidata":"https://www.wikidata.org/wiki/Q177251","display_name":"Associative property","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.05725","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.05725","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.05725","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.05725","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.7494874000549316,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Muon":[0,29,79,96,130,158],"updates":[1],"matrix":[2,6,137],"parameters":[3],"via":[4],"the":[5,9,78,93,104,150,182],"sign":[7,154],"of":[8],"gradient":[10,146],"and":[11,20,39,48,87,118,144,178],"has":[12],"shown":[13],"strong":[14],"empirical":[15],"gains,":[16],"yet":[17],"its":[18,120],"dynamics":[19],"scaling":[21,116,122],"behavior":[22],"remain":[23],"unclear":[24],"in":[25,30,92,103,171],"theory.":[26,183],"We":[27],"study":[28],"a":[31,40,108],"linear":[32],"associative":[33],"memory":[34],"model":[35],"with":[36,47,107,152],"softmax":[37],"retrieval":[38],"hierarchical":[41],"frequency":[42,62,110],"spectrum":[43],"over":[44,101,124],"query-answer":[45],"pairs,":[46],"without":[49],"label":[50],"noise.":[51],"In":[52,76,148],"this":[53,82],"setting,":[54],"we":[55,112,127],"show":[56,128],"that":[57,129],"Gradient":[58],"Descent":[59],"(GD)":[60],"learns":[61],"components":[63],"at":[64],"highly":[65],"imbalanced":[66],"rates,":[67],"leading":[68,84],"to":[69,85,162],"slow":[70],"convergence":[71],"bottlenecked":[72],"by":[73],"low-frequency":[74],"components.":[75],"contrast,":[77,149],"optimizer":[80],"mitigates":[81],"imbalance,":[83],"faster":[86],"more":[88],"uniform":[89],"progress.":[90],"Specifically,":[91],"noiseless":[94],"case,":[95],"achieves":[97],"an":[98,135],"exponential":[99],"speedup":[100],"GD;":[102],"noisy":[105],"case":[106],"power-decay":[109],"spectrum,":[111],"derive":[113],"Muon's":[114],"optimization":[115],"law":[117],"demonstrate":[119],"superior":[121],"efficiency":[123],"GD.":[125],"Furthermore,":[126],"can":[131],"be":[132],"interpreted":[133],"as":[134],"implicit":[136],"preconditioner":[138,151],"arising":[139],"from":[140],"adaptive":[141],"task":[142,164],"alignment":[143],"block-symmetric":[145],"structure.":[147],"coordinate-wise":[153],"operator":[155],"could":[156],"match":[157],"under":[159],"oracle":[160],"access":[161],"unknown":[163],"representations,":[165],"which":[166],"is":[167],"infeasible":[168],"for":[169],"SignGD":[170],"practice.":[172],"Experiments":[173],"on":[174],"synthetic":[175],"long-tail":[176],"classification":[177],"LLaMA-style":[179],"pre-training":[180],"corroborate":[181]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-07T00:00:00"}
