{"id":"https://openalex.org/W7124163283","doi":"https://doi.org/10.48550/arxiv.2601.08215","title":"Towards Principled Design of Mixture-of-Experts Language Models under Memory and Inference Constraints","display_name":"Towards Principled Design of Mixture-of-Experts Language Models under Memory and Inference Constraints","publication_year":2026,"publication_date":"2026-01-13","ids":{"openalex":"https://openalex.org/W7124163283","doi":"https://doi.org/10.48550/arxiv.2601.08215"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.08215","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.08215","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.08215","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048228893","display_name":"Seng Pei Liew","orcid":"https://orcid.org/0000-0003-2419-2505"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liew, Seng Pei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055117550","display_name":"Kenta Shinzato","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shinzato, Kenta","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123002010","display_name":"Yuyang Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Yuyang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5048228893"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2989000082015991,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2989000082015991,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.14100000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.11410000175237656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.6633999943733215},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5354999899864197},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.49619999527931213},{"id":"https://openalex.org/keywords/forcing","display_name":"Forcing (mathematics)","score":0.47600001096725464},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.45249998569488525},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.4433000087738037},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4339999854564667}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6711000204086304},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.6633999943733215},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5354999899864197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.505299985408783},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.49619999527931213},{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.47600001096725464},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.4433000087738037},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4339999854564667},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41290000081062317},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.358599990606308},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3172999918460846},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3109000027179718},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C134261354","wikidata":"https://www.wikidata.org/wiki/Q938438","display_name":"Statistical inference","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C163175372","wikidata":"https://www.wikidata.org/wiki/Q3339222","display_name":"Linear model","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2524000108242035},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.08215","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.08215","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.08215","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.08215","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"Mixture-of-Experts":[1],"(MoE)":[2],"language":[3],"models":[4],"are":[5,25],"designed":[6],"based":[7],"on":[8],"total":[9,45,67],"parameters":[10,15,46],"(memory":[11],"footprint)":[12],"and":[13,48,54,83,105,121],"active":[14],"(inference":[16],"cost).":[17],"However,":[18],"we":[19,36],"find":[20],"these":[21],"two":[22],"factors":[23],"alone":[24],"insufficient":[26],"to":[27,85],"describe":[28],"an":[29],"optimal":[30],"architecture.":[31],"Through":[32],"a":[33,65,76,91,114],"systematic":[34],"study,":[35],"demonstrate":[37],"that":[38],"MoE":[39,95,123],"performance":[40,73],"is":[41],"primarily":[42],"determined":[43],"by":[44,74],"($N_{total}$)":[47],"expert":[49],"sparsity":[50,62],"($s:=n_{exp}/n_{topk}$).":[51],"Moreover,":[52],"$n_{exp}$":[53,106],"$n_{topk}$":[55],"do":[56],"not":[57],"\"cancel":[58],"out\"":[59],"within":[60],"the":[61,108],"ratio;":[63],"instead,":[64],"larger":[66],"number":[68],"of":[69],"experts":[70],"slightly":[71],"penalizes":[72],"forcing":[75],"reduction":[77],"in":[78],"core":[79],"model":[80],"dimensions":[81],"(depth":[82],"width)":[84],"meet":[86],"memory":[87],"constraints.":[88,110],"This":[89],"motivates":[90],"simple":[92],"principle":[93],"for":[94,117],"design":[96],"which":[97],"maximizes":[98],"$N_{total}$":[99],"while":[100],"minimizing":[101],"$s$":[102],"(maximizing":[103],"$n_{topk}$)":[104],"under":[107],"given":[109],"Our":[111],"findings":[112],"provide":[113],"robust":[115],"framework":[116],"resolving":[118],"architectural":[119],"ambiguity":[120],"guiding":[122],"design.":[124]},"counts_by_year":[],"updated_date":"2026-01-15T23:21:31.212559","created_date":"2026-01-15T00:00:00"}
