{"id":"https://openalex.org/W4402351149","doi":"https://doi.org/10.1109/ijcnn60899.2024.10650737","title":"Efficient Routing in Sparse Mixture-of-Experts","display_name":"Efficient Routing in Sparse Mixture-of-Experts","publication_year":2024,"publication_date":"2024-06-30","ids":{"openalex":"https://openalex.org/W4402351149","doi":"https://doi.org/10.1109/ijcnn60899.2024.10650737"},"language":"en","primary_location":{"id":"doi:10.1109/ijcnn60899.2024.10650737","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ijcnn60899.2024.10650737","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5009550171","display_name":"Masoumeh Zareapoor","orcid":"https://orcid.org/0000-0002-3991-0584"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Masoumeh Zareapoor","raw_affiliation_strings":["Shanghai Jiao Tong University,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079232252","display_name":"Pourya Shamsolmoali","orcid":"https://orcid.org/0000-0002-0263-1661"},"institutions":[{"id":"https://openalex.org/I126231945","display_name":"Queen's University Belfast","ror":"https://ror.org/00hswnk62","country_code":"GB","type":"education","lineage":["https://openalex.org/I126231945"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Pourya Shamsolmoali","raw_affiliation_strings":["Queen&#x2019;s University Belfast,Belfast,UK"],"affiliations":[{"raw_affiliation_string":"Queen&#x2019;s University Belfast,Belfast,UK","institution_ids":["https://openalex.org/I126231945"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5107079437","display_name":"Fateme Vesaghati","orcid":null},"institutions":[{"id":"https://openalex.org/I110525433","display_name":"Islamic Azad University, Tehran","ror":"https://ror.org/01kzn7k21","country_code":"IR","type":"education","lineage":["https://openalex.org/I110525433"]}],"countries":["IR"],"is_corresponding":false,"raw_author_name":"Fateme Vesaghati","raw_affiliation_strings":["Azad University,Tehran,Iran"],"affiliations":[{"raw_affiliation_string":"Azad University,Tehran,Iran","institution_ids":["https://openalex.org/I110525433"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5009550171"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.4141,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.72716665,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6789340376853943},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.5284709334373474},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.4620264172554016},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32342809438705444},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.18901896476745605}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6789340376853943},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.5284709334373474},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.4620264172554016},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32342809438705444},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.18901896476745605},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/ijcnn60899.2024.10650737","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ijcnn60899.2024.10650737","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.qub.ac.uk/portal:publications/d03188fb-3caf-4756-871c-171416413840","is_oa":false,"landing_page_url":"https://pure.qub.ac.uk/en/publications/d03188fb-3caf-4756-871c-171416413840","pdf_url":null,"source":{"id":"https://openalex.org/S4306402319","display_name":"Research Portal (Queen's University Belfast)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I126231945","host_organization_name":"Queen's University Belfast","host_organization_lineage":["https://openalex.org/I126231945"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Zareapoor, M, Shamsolmoali, P & Vesaghati, F 2024, Efficient routing in sparse mixture-of-experts. in 2024 International Joint Conference on Neural Networks (IJCNN): Proceedings. International Joint Conference on Neural Networks (IJCNN): Proceedings, Institute of Electrical and Electronics Engineers Inc., 2024 International Joint Conference on Neural Networks (IJCNN), Yokohama, Japan, 30/06/2024. https://doi.org/10.1109/IJCNN60899.2024.10650737","raw_type":"info:eu-repo/semantics/conferenceObject"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W2108598243","https://openalex.org/W2158131535","https://openalex.org/W2899663614","https://openalex.org/W2923014074","https://openalex.org/W2943552823","https://openalex.org/W2962843773","https://openalex.org/W2962997028","https://openalex.org/W2964110616","https://openalex.org/W2989847975","https://openalex.org/W3023653415","https://openalex.org/W3036601975","https://openalex.org/W3040573126","https://openalex.org/W3094502228","https://openalex.org/W3118210634","https://openalex.org/W3170796112","https://openalex.org/W4226079124","https://openalex.org/W4229042118","https://openalex.org/W4229081350","https://openalex.org/W4287121196","https://openalex.org/W4287391717","https://openalex.org/W4293718192","https://openalex.org/W4297631950","https://openalex.org/W4311252752","https://openalex.org/W4385968027","https://openalex.org/W4387130479","https://openalex.org/W4388650735","https://openalex.org/W4391164109","https://openalex.org/W6682962330","https://openalex.org/W6731370813","https://openalex.org/W6732520560","https://openalex.org/W6737976933","https://openalex.org/W6745495346","https://openalex.org/W6755977528","https://openalex.org/W6759058519","https://openalex.org/W6762392948","https://openalex.org/W6780218876","https://openalex.org/W6780805062","https://openalex.org/W6784333009","https://openalex.org/W6788811087","https://openalex.org/W6793102544","https://openalex.org/W6796487566","https://openalex.org/W6796854725","https://openalex.org/W6810702803","https://openalex.org/W6810737565","https://openalex.org/W6811072154","https://openalex.org/W6843384352","https://openalex.org/W6845546282","https://openalex.org/W6847237767","https://openalex.org/W6848916189","https://openalex.org/W6849790239","https://openalex.org/W6855688861","https://openalex.org/W6858690822","https://openalex.org/W6860432385"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Sparse":[0],"Mixture-of-Experts":[1],"(MoE)":[2],"architectures":[3],"provide":[4],"the":[5,11,18,28,36,56,65,88,96,109,123,152,165,177],"distinct":[6],"benefit":[7],"of":[8,30,76,112,142,179],"substantially":[9],"expanding":[10],"model\u2019s":[12],"parameter":[13],"space":[14],"without":[15],"proportionally":[16],"increasing":[17],"computational":[19,116,154],"load":[20],"on":[21,35,64],"individual":[22],"input":[23],"tokens":[24,42],"or":[25,51],"samples.":[26],"However,":[27],"efficacy":[29],"these":[31,147],"models":[32],"heavily":[33],"depends":[34],"routing":[37,46,84],"strategy":[38],"used":[39],"to":[40,43,49,73,99,119,146,176],"assign":[41],"experts.":[44,77],"Poor":[45],"can":[47],"lead":[48],"under-trained":[50],"overly":[52],"specialized":[53],"experts,":[54],"diminishing":[55],"overall":[57],"model":[58,135,157,174],"performance.":[59,195],"Previous":[60],"approaches":[61],"have":[62],"relied":[63],"Topk":[66,89],"router,":[67],"where":[68],"each":[69],"token":[70],"is":[71],"assigned":[72],"a":[74,83,105,140],"subset":[75],"In":[78],"this":[79],"paper,":[80],"we":[81],"propose":[82],"mechanism":[85],"that":[86,133,172],"replaces":[87],"router":[90],"with":[91],"regularized":[92],"optimal":[93],"transport,":[94],"leveraging":[95],"Sinkhorn":[97],"algorithm":[98],"optimize":[100],"token-expert":[101,180],"matching.":[102],"We":[103,170],"conducted":[104],"comprehensive":[106],"evaluation":[107],"comparing":[108],"pre-training":[110],"efficiency":[111,192],"our":[113,134,156,173],"model,":[114],"using":[115],"resources":[117],"equivalent":[118],"those":[120],"employed":[121],"in":[122,182,189],"GShard":[124],"and":[125,167,193],"Switch":[126],"Transformers":[127],"gating":[128],"mechanisms.":[129],"The":[130],"results":[131],"demonstrate":[132],"expedites":[136],"training":[137,191],"convergence,":[138],"achieving":[139],"speedup":[141],"over":[143],"2\u00d7":[144],"compared":[145],"baseline":[148],"models.":[149],"Moreover,":[150],"under":[151],"same":[153],"constraints,":[155],"exhibits":[158],"superior":[159],"performance":[160],"across":[161],"eleven":[162],"tasks":[163],"from":[164],"GLUE":[166],"SuperGLUE":[168],"benchmarks.":[169],"show":[171],"contributes":[175],"optimization":[178],"matching":[181],"sparsely-activated":[183],"MoE":[184],"models,":[185],"offering":[186],"substantial":[187],"gains":[188],"both":[190],"task":[194]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-25T13:04:00.132906","created_date":"2025-10-10T00:00:00"}
