{"id":"https://openalex.org/W7161052638","doi":"https://doi.org/10.48550/arxiv.2605.11396","title":"MuonQ: Enhancing Low-Bit Muon Quantization via Directional Fidelity Optimization","display_name":"MuonQ: Enhancing Low-Bit Muon Quantization via Directional Fidelity Optimization","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161052638","doi":"https://doi.org/10.48550/arxiv.2605.11396"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11396","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124857835","display_name":"Yupeng Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Yupeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136060310","display_name":"Ruijie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ruijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101933564","display_name":"Ziyue Liu","orcid":"https://orcid.org/0000-0001-9538-5350"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112593845","display_name":"Yequan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yequan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136073209","display_name":"Zheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.39559999108314514,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12579","display_name":"Muon and positron interactions and applications","score":0.39559999108314514,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.03440000116825104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10921","display_name":"Neutrino Physics Research","score":0.032499998807907104,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.7312999963760376},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.545199990272522},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.4334000051021576},{"id":"https://openalex.org/keywords/singular-value-decomposition","display_name":"Singular value decomposition","score":0.4293999969959259},{"id":"https://openalex.org/keywords/orthogonalization","display_name":"Orthogonalization","score":0.38749998807907104},{"id":"https://openalex.org/keywords/round-off-error","display_name":"Round-off error","score":0.3684000074863434},{"id":"https://openalex.org/keywords/hypersphere","display_name":"Hypersphere","score":0.3562000095844269},{"id":"https://openalex.org/keywords/linde\u2013buzo\u2013gray-algorithm","display_name":"Linde\u2013Buzo\u2013Gray algorithm","score":0.3240000009536743}],"concepts":[{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.7312999963760376},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5741999745368958},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.545199990272522},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4596000015735626},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.4334000051021576},{"id":"https://openalex.org/C22789450","wikidata":"https://www.wikidata.org/wiki/Q420904","display_name":"Singular value decomposition","level":2,"score":0.4293999969959259},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.4124000072479248},{"id":"https://openalex.org/C47559304","wikidata":"https://www.wikidata.org/wiki/Q1702189","display_name":"Orthogonalization","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C61005703","wikidata":"https://www.wikidata.org/wiki/Q2145211","display_name":"Round-off error","level":2,"score":0.3684000074863434},{"id":"https://openalex.org/C2776562905","wikidata":"https://www.wikidata.org/wiki/Q306610","display_name":"Hypersphere","level":2,"score":0.3562000095844269},{"id":"https://openalex.org/C93372532","wikidata":"https://www.wikidata.org/wiki/Q6552455","display_name":"Linde\u2013Buzo\u2013Gray algorithm","level":3,"score":0.3240000009536743},{"id":"https://openalex.org/C109282560","wikidata":"https://www.wikidata.org/wiki/Q4166054","display_name":"Singular value","level":3,"score":0.31850001215934753},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C47446073","wikidata":"https://www.wikidata.org/wiki/Q5165890","display_name":"Control theory (sociology)","level":3,"score":0.2628999948501587},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2599000036716461},{"id":"https://openalex.org/C53533937","wikidata":"https://www.wikidata.org/wiki/Q185020","display_name":"Histogram","level":3,"score":0.2515999972820282},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"Muon":[1,68,188],"optimizer":[2,25,170,199],"has":[3],"emerged":[4],"as":[5],"a":[6,66,82,102,108],"compelling":[7],"alternative":[8],"to":[9,30,142,146,158,204],"Adam":[10],"for":[11],"training":[12,69,191],"large":[13],"language":[14],"models,":[15],"achieving":[16],"remarkable":[17],"computational":[18],"savings":[19],"through":[20],"gradient":[21],"orthogonalization.":[22],"However,":[23],"Muon's":[24,169],"state":[26,200],"is":[27,209],"more":[28],"sensitive":[29],"quantization":[31,49,90,123,141,153,167],"errors:":[32],"because":[33],"the":[34,37,58,73,93,97,114,152],"orthogonalization":[35],"discards":[36],"magnitudes":[38,129],"of":[39,75,92,168],"singular":[40,52,116,127,133],"values":[41],"and":[42,176,193],"retains":[43],"only":[44,126],"directional":[45,76],"information,":[46],"even":[47],"small":[48],"errors":[50,91,124],"in":[51,57,189],"vector":[53,134],"directions":[54],"are":[55],"amplified":[56],"update.":[59],"In":[60],"this":[61],"work,":[62],"we":[63,80,106,137],"propose":[64],"MuonQ,":[65],"low-bit":[67],"framework":[70],"built":[71],"on":[72,174],"principle":[74],"fidelity":[77],"optimization.":[78],"First,":[79],"apply":[81],"pre-quantization":[83],"normalization":[84],"so":[85],"that":[86,111,122,180],"each":[87],"step":[88],"introduces":[89],"same":[94],"magnitude,":[95],"preventing":[96],"accumulated":[98],"error":[99],"from":[100,155],"developing":[101],"preferred":[103],"direction.":[104],"Second,":[105],"introduce":[107],"structural":[109],"decomposition":[110],"separately":[112],"quantizes":[113],"dominant":[115],"components":[117],"via":[118],"power":[119],"iteration,":[120],"ensuring":[121],"perturb":[125],"value":[128],"rather":[130],"than":[131],"rotating":[132],"directions.":[135],"Third,":[136],"adopt":[138],"$\u03bc$-law":[139],"companding":[140],"allocate":[143],"higher":[144],"resolution":[145],"densely":[147],"packed":[148],"momentum":[149],"values,":[150],"shifting":[151],"objective":[154],"outlier":[156],"preservation":[157],"dense-region":[159],"distinguishability.":[160],"Together,":[161],"these":[162],"techniques":[163],"enable":[164],"stable":[165],"4-bit":[166,183],"states.":[171],"Pre-training":[172],"experiments":[173],"GPT-style":[175],"LLaMA-style":[177],"models":[178],"demonstrate":[179],"MuonQ":[181],"at":[182,211],"precision":[184],"closely":[185],"matches":[186],"full-precision":[187],"both":[190],"loss":[192],"downstream":[194],"task":[195],"accuracy,":[196],"while":[197],"reducing":[198],"memory":[201],"by":[202],"up":[203],"7.3":[205],"$\\times$.":[206],"Our":[207],"code":[208],"available":[210],"https://github.com/YupengSu/MuonQ.":[212]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
