{"id":"https://openalex.org/W7116666918","doi":"https://doi.org/10.1109/tit.2025.3647061","title":"Convergence Rates for Softmax Gating Mixture of Experts","display_name":"Convergence Rates for Softmax Gating Mixture of Experts","publication_year":2025,"publication_date":"2025-12-22","ids":{"openalex":"https://openalex.org/W7116666918","doi":"https://doi.org/10.1109/tit.2025.3647061"},"language":null,"primary_location":{"id":"doi:10.1109/tit.2025.3647061","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tit.2025.3647061","pdf_url":null,"source":{"id":"https://openalex.org/S4502562","display_name":"IEEE Transactions on Information Theory","issn_l":"0018-9448","issn":["0018-9448","1557-9654"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Information Theory","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120975056","display_name":"Huy Nguyen","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Huy Nguyen","raw_affiliation_strings":["Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA"],"affiliations":[{"raw_affiliation_string":"Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112412955","display_name":"Nhat Ho","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nhat Ho","raw_affiliation_strings":["Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA"],"affiliations":[{"raw_affiliation_string":"Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112007828","display_name":"Alessandro Rinaldo","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alessandro Rinaldo","raw_affiliation_strings":["Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA"],"affiliations":[{"raw_affiliation_string":"Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA","institution_ids":["https://openalex.org/I86519309"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5120975056"],"corresponding_institution_ids":["https://openalex.org/I86519309"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.70836021,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"72","issue":"2","first_page":"1276","last_page":"1304"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.21289999783039093,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.21289999783039093,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.12240000069141388,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.1136000007390976,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.9164000153541565},{"id":"https://openalex.org/keywords/gating","display_name":"Gating","score":0.7311000227928162},{"id":"https://openalex.org/keywords/identifiability","display_name":"Identifiability","score":0.5909000039100647},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5623999834060669},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5406000018119812},{"id":"https://openalex.org/keywords/differential","display_name":"Differential (mechanical device)","score":0.3422999978065491},{"id":"https://openalex.org/keywords/subnet","display_name":"Subnet","score":0.328000009059906}],"concepts":[{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.9164000153541565},{"id":"https://openalex.org/C194544171","wikidata":"https://www.wikidata.org/wiki/Q21105679","display_name":"Gating","level":2,"score":0.7311000227928162},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6621999740600586},{"id":"https://openalex.org/C122770356","wikidata":"https://www.wikidata.org/wiki/Q1656753","display_name":"Identifiability","level":2,"score":0.5909000039100647},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5623999834060669},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5406000018119812},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4957999885082245},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4851999878883362},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3686999976634979},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3513000011444092},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.3422999978065491},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3312999904155731},{"id":"https://openalex.org/C21099817","wikidata":"https://www.wikidata.org/wiki/Q7631721","display_name":"Subnet","level":2,"score":0.328000009059906},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.32199999690055847},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C167928553","wikidata":"https://www.wikidata.org/wiki/Q1376021","display_name":"Estimation theory","level":2,"score":0.2540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tit.2025.3647061","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tit.2025.3647061","pdf_url":null,"source":{"id":"https://openalex.org/S4502562","display_name":"IEEE Transactions on Information Theory","issn_l":"0018-9448","issn":["0018-9448","1557-9654"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Information Theory","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W306662213","https://openalex.org/W1526081617","https://openalex.org/W1538452572","https://openalex.org/W1581352608","https://openalex.org/W1969128414","https://openalex.org/W1994108141","https://openalex.org/W2025653905","https://openalex.org/W2027628336","https://openalex.org/W2061155774","https://openalex.org/W2098288588","https://openalex.org/W2146444015","https://openalex.org/W2147503863","https://openalex.org/W2148845857","https://openalex.org/W2150884987","https://openalex.org/W2502486585","https://openalex.org/W3081932626","https://openalex.org/W3196950043","https://openalex.org/W4385245566","https://openalex.org/W4389520168","https://openalex.org/W4402671950","https://openalex.org/W4415797268"],"related_works":[],"abstract_inverted_index":{"Mixture":[0],"of":[1,37,48,75,98,105,137,190,200],"experts":[2,59,154],"(MoE)":[3],"has":[4,82],"recently":[5],"emerged":[6],"as":[7,187],"an":[8,40],"effective":[9],"framework":[10],"for":[11,102],"deploying":[12],"machine":[13],"learning":[14],"models":[15],"in":[16,67,85,197],"a":[17,52,69,94,120,124,162,188],"scalable":[18],"and":[19,55,108,116,123],"efficient":[20],"way":[21],"by":[22],"softly":[23],"dividing":[24],"complex":[25],"tasks":[26],"among":[27],"multiple":[28],"specialized":[29],"sub-models":[30],"termed":[31],"experts.":[32],"Central":[33],"to":[34,51,152],"the":[35,46,73,76,80,86,99,103,135,176,198],"success":[36],"MoE":[38,81],"is":[39],"adaptive":[41],"gating":[42,78,115,122],"mechanism":[43],"which":[44,174,194],"determines":[45],"relevance":[47],"each":[49],"expert":[50,109,139],"given":[53],"input":[54],"then":[56],"dynamically":[57],"assigns":[58],"their":[60],"respective":[61],"weights.":[62],"Despite":[63],"its":[64],"widespread":[65],"use":[66],"practice,":[68],"comprehensive":[70],"study":[71],"on":[72,79],"effects":[74],"softmax":[77,114,126],"been":[83],"lacking":[84],"literature.":[87],"To":[88],"bridge":[89],"this":[90],"gap,":[91],"we":[92,143,195],"conduct":[93],"thorough":[95],"theoretical":[96,129],"analysis":[97],"convergence":[100],"rates":[101],"problem":[104],"parameter":[106,192],"estimation":[107],"estimation.":[110],"We":[111],"consider":[112],"standard":[113],"several":[117],"variants,":[118],"including":[119],"dense-to-sparse":[121],"hierarchical":[125],"gating.":[127],"Our":[128],"results":[130],"provide":[131],"useful":[132],"insights":[133],"into":[134],"design":[136],"sample-efficient":[138],"structures.":[140],"In":[141,168],"particular,":[142],"demonstrate":[144],"that":[145],"it":[146],"requires":[147],"polynomially":[148],"many":[149,184],"data":[150,185],"points":[151,186],"estimate":[153],"satisfying":[155],"our":[156],"proposed":[157],"strong":[158],"identifiability":[159],"condition,":[160,181],"namely":[161],"commonly":[163],"used":[164],"two-layer":[165],"feed-forward":[166],"network.":[167],"stark":[169],"contrast,":[170],"estimating":[171],"linear":[172],"experts,":[173],"violate":[175],"<italic":[177],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[178],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">strong":[179],"identifiability</i>":[180],"necessitates":[182],"exponentially":[183],"result":[189],"intrinsic":[191],"interactions,":[193],"express":[196],"language":[199],"partial":[201],"differential":[202],"equations.":[203]},"counts_by_year":[],"updated_date":"2026-01-25T23:04:38.658462","created_date":"2025-12-22T00:00:00"}
