{"id":"https://openalex.org/W7125789388","doi":"https://doi.org/10.48550/arxiv.2601.17680","title":"$\\infty$-MoE: Generalizing Mixture of Experts to Infinite Experts","display_name":"$\\infty$-MoE: Generalizing Mixture of Experts to Infinite Experts","publication_year":2026,"publication_date":"2026-01-25","ids":{"openalex":"https://openalex.org/W7125789388","doi":"https://doi.org/10.48550/arxiv.2601.17680"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.17680","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.17680","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.17680","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124004546","display_name":"Shota Takashiro","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Takashiro, Shota","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123910016","display_name":"Takeshi Kojima","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kojima, Takeshi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124035347","display_name":"Shohei Taniguchi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taniguchi, Shohei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123943376","display_name":"Yusuke Iwasawa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Iwasawa, Yusuke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123897017","display_name":"Yutaka Matsuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matsuo, Yutaka","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5124004546"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.5248000025749207,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.5248000025749207,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.05460000038146973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.040699999779462814,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5393000245094299},{"id":"https://openalex.org/keywords/expert-system","display_name":"Expert system","score":0.453000009059906},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.34769999980926514},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.3059000074863434},{"id":"https://openalex.org/keywords/discrete-time-and-continuous-time","display_name":"Discrete time and continuous time","score":0.28200000524520874}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6051999926567078},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5393000245094299},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.453000009059906},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4361000061035156},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42739999294281006},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3249000012874603},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.319599986076355},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.28790000081062317},{"id":"https://openalex.org/C55689738","wikidata":"https://www.wikidata.org/wiki/Q15963867","display_name":"Discrete time and continuous time","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.25679999589920044},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.2524999976158142},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.17680","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.17680","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.17680","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.17680","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"Mixture":[1],"of":[2,46,64,73,76,101,136,154],"Experts":[3],"(MoE)":[4],"selects":[5,70],"a":[6,37,41,71,91,110,126,144],"few":[7],"feed-forward":[8],"networks":[9],"(FFNs)":[10],"per":[11],"token,":[12],"achieving":[13],"an":[14,98,152],"effective":[15],"trade-off":[16,146],"between":[17,147],"computational":[18,105],"cost":[19],"and":[20,32,118,149],"performance.":[21],"In":[22],"conventional":[23,161],"MoE,":[24],"each":[25,54,85],"expert":[26,55],"is":[27],"treated":[28],"as":[29],"entirely":[30],"independent,":[31],"experts":[33,47,89,102,138],"are":[34],"combined":[35],"in":[36,90,158],"discrete":[38],"space.":[39],"As":[40],"result,":[42],"when":[43],"the":[44,62,74,134],"number":[45,63,100,135],"increases,":[48],"it":[49],"becomes":[50],"difficult":[51],"to":[52,125,156],"train":[53],"effectively.":[56],"To":[57],"stabilize":[58],"training":[59],"while":[60,103],"increasing":[61],"experts,":[65],"we":[66],"propose":[67],"$\\infty$-MoE":[68,113],"that":[69,109],"portion":[72],"parameters":[75],"large":[77],"FFNs":[78],"based":[79],"on":[80],"continuous":[81,92],"values":[82],"sampled":[83,137],"for":[84,97,143],"token.":[86],"By":[87],"considering":[88],"space,":[93],"this":[94],"approach":[95],"allows":[96,142],"infinite":[99],"maintaining":[104],"efficiency.":[106],"Experiments":[107],"show":[108],"GPT-2":[111,128],"Small-based":[112],"model,":[114],"with":[115,130,151],"129M":[116],"active":[117],"186M":[119],"total":[120],"parameters,":[121],"achieves":[122],"comparable":[123],"performance":[124],"dense":[127],"Medium":[129],"350M":[131],"parameters.":[132],"Adjusting":[133],"at":[139],"inference":[140],"time":[141],"flexible":[145],"accuracy":[148,159],"speed,":[150],"improvement":[153],"up":[155],"2.5\\%":[157],"over":[160],"MoE.":[162]},"counts_by_year":[],"updated_date":"2026-01-28T23:18:48.515280","created_date":"2026-01-28T00:00:00"}
