{"id":"https://openalex.org/W7127151387","doi":"https://doi.org/10.48550/arxiv.2601.23000","title":"Mano: Restriking Manifold Optimization for LLM Training","display_name":"Mano: Restriking Manifold Optimization for LLM Training","publication_year":2026,"publication_date":"2026-01-30","ids":{"openalex":"https://openalex.org/W7127151387","doi":"https://doi.org/10.48550/arxiv.2601.23000"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.23000","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124821616","display_name":"Yufei Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Yufei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124842349","display_name":"Zeke Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Zeke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.21449999511241913,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.21449999511241913,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.10279999673366547,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.05460000038146973,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/manifold","display_name":"Manifold (fluid mechanics)","score":0.49149999022483826},{"id":"https://openalex.org/keywords/curvature","display_name":"Curvature","score":0.48590001463890076},{"id":"https://openalex.org/keywords/tangent","display_name":"Tangent","score":0.4512999951839447},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.4471000134944916},{"id":"https://openalex.org/keywords/space-mapping","display_name":"Space mapping","score":0.44119998812675476},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.415800005197525},{"id":"https://openalex.org/keywords/oblique-case","display_name":"Oblique case","score":0.4113999903202057},{"id":"https://openalex.org/keywords/multi-objective-optimization","display_name":"Multi-objective optimization","score":0.39419999718666077},{"id":"https://openalex.org/keywords/optimization-problem","display_name":"Optimization problem","score":0.3815999925136566}],"concepts":[{"id":"https://openalex.org/C529865628","wikidata":"https://www.wikidata.org/wiki/Q1790740","display_name":"Manifold (fluid mechanics)","level":2,"score":0.49149999022483826},{"id":"https://openalex.org/C195065555","wikidata":"https://www.wikidata.org/wiki/Q214881","display_name":"Curvature","level":2,"score":0.48590001463890076},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.48179998993873596},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4814000129699707},{"id":"https://openalex.org/C138187205","wikidata":"https://www.wikidata.org/wiki/Q131251","display_name":"Tangent","level":2,"score":0.4512999951839447},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.4471000134944916},{"id":"https://openalex.org/C68028875","wikidata":"https://www.wikidata.org/wiki/Q7572595","display_name":"Space mapping","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4212999939918518},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.415800005197525},{"id":"https://openalex.org/C160697094","wikidata":"https://www.wikidata.org/wiki/Q1233197","display_name":"Oblique case","level":2,"score":0.4113999903202057},{"id":"https://openalex.org/C68781425","wikidata":"https://www.wikidata.org/wiki/Q2052203","display_name":"Multi-objective optimization","level":2,"score":0.39419999718666077},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3815999925136566},{"id":"https://openalex.org/C151876577","wikidata":"https://www.wikidata.org/wiki/Q7049464","display_name":"Nonlinear dimensionality reduction","level":3,"score":0.3720000088214874},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35589998960494995},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3515999913215637},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.34540000557899475},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C137635306","wikidata":"https://www.wikidata.org/wiki/Q182667","display_name":"Pareto principle","level":2,"score":0.3343999981880188},{"id":"https://openalex.org/C29513896","wikidata":"https://www.wikidata.org/wiki/Q7489239","display_name":"Shape optimization","level":3,"score":0.3176000118255615},{"id":"https://openalex.org/C164752517","wikidata":"https://www.wikidata.org/wiki/Q5570875","display_name":"Global optimization","level":2,"score":0.3124000132083893},{"id":"https://openalex.org/C130367717","wikidata":"https://www.wikidata.org/wiki/Q189791","display_name":"Diagonal","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C157157409","wikidata":"https://www.wikidata.org/wiki/Q909601","display_name":"Tangent space","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.27399998903274536},{"id":"https://openalex.org/C51485801","wikidata":"https://www.wikidata.org/wiki/Q16966861","display_name":"Efficient frontier","level":3,"score":0.26600000262260437},{"id":"https://openalex.org/C184720557","wikidata":"https://www.wikidata.org/wiki/Q7825049","display_name":"Topology (electrical circuits)","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.23000","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.23000","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.23000","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.23000","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"large":[1],"language":[2],"models":[3,139],"(LLMs)":[4],"have":[5,75],"emerged":[6],"as":[7],"a":[8,104,110],"significant":[9],"advancement":[10],"in":[11,84,164],"artificial":[12],"intelligence,":[13],"the":[14,27,47,81,91,94,119,123,135],"hardware":[15],"and":[16,36,100,113,129,137,144,148,155,168],"computational":[17,156],"costs":[18],"for":[19,61],"training":[20,62],"LLMs":[21],"are":[22],"also":[23],"significantly":[24,145],"burdensome.":[25],"Among":[26],"state-of-the-art":[28],"optimizers,":[29],"AdamW":[30,147],"relies":[31],"on":[32,103,134],"diagonal":[33],"curvature":[34,51],"estimates":[35],"ignores":[37],"structural":[38],"properties,":[39],"while":[40,70],"Muon":[41,149],"applies":[42],"global":[43],"spectral":[44],"normalization":[45],"at":[46],"expense":[48],"of":[49,97,166],"losing":[50],"information.":[52],"In":[53],"this":[54],"study,":[55],"we":[56,108],"restriked":[57],"manifold":[58,72,127],"optimization":[59,73,128],"methods":[60,74],"LLMs,":[63],"which":[64],"may":[65],"address":[66],"both":[67],"optimizers'":[68],"limitations,":[69],"conventional":[71],"been":[76],"largely":[77],"overlooked":[78],"due":[79],"to":[80,121],"poor":[82],"performance":[83,124],"large-scale":[85],"model":[86,98],"optimization.":[87],"By":[88],"innovatively":[89],"projecting":[90],"momentum":[92],"onto":[93],"tangent":[95],"space":[96,167],"parameters":[99],"constraining":[101],"it":[102],"rotational":[105],"Oblique":[106],"manifold,":[107],"propose":[109],"novel,":[111],"powerful,":[112],"efficient":[114],"optimizer":[115],"**Mano**":[116],"that":[117,141],"is":[118],"first":[120],"bridge":[122],"gap":[125],"between":[126],"modern":[130],"optimizers.":[131],"Extensive":[132],"experiments":[133],"LLaMA":[136],"Qwen3":[138],"demonstrate":[140],"Mano":[142],"consistently":[143],"outperforms":[146],"even":[150],"with":[151],"less":[152],"memory":[153],"consumption":[154],"complexity,":[157],"respectively,":[158],"suggesting":[159],"an":[160],"expanded":[161],"Pareto":[162],"frontier":[163],"terms":[165],"time":[169],"efficiency.":[170]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-03T00:00:00"}
