{"id":"https://openalex.org/W4416035879","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.43","title":"Bag of Tricks for Sparse Mixture-of-Experts: A Benchmark Across Reasoning, Efficiency, and Safety","display_name":"Bag of Tricks for Sparse Mixture-of-Experts: A Benchmark Across Reasoning, Efficiency, and Safety","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416035879","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.43"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.43","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.43","pdf_url":"https://aclanthology.org/2025.findings-emnlp.43.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-emnlp.43.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Mufan Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mufan Qiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101314676","display_name":"Zheyu Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheyu Shen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102745685","display_name":"Pingzhi Li","orcid":"https://orcid.org/0009-0007-9935-4456"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pingzhi Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100413590","display_name":"Ang Li","orcid":"https://orcid.org/0000-0002-0063-1422"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ang Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103073431","display_name":"Tianlong Chen","orcid":"https://orcid.org/0000-0001-7774-8197"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianlong Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32121084,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"817","last_page":"835"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.7465999722480774,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.7465999722480774,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.050999999046325684,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.02449999935925007,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5088000297546387},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.32429999113082886},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.2915000021457672},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.28209999203681946},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.23250000178813934}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6351000070571899},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5088000297546387},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4205999970436096},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.32429999113082886},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3228999972343445},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.28209999203681946},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.257999986410141},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.23250000178813934},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.22380000352859497}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-emnlp.43","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.43","pdf_url":"https://aclanthology.org/2025.findings-emnlp.43.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.43","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.43","pdf_url":"https://aclanthology.org/2025.findings-emnlp.43.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416035879.pdf","grobid_xml":"https://content.openalex.org/works/W4416035879.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"has":[2],"emerged":[3],"as":[4,51],"a":[5,17,42],"promising":[6],"approach":[7],"for":[8,203],"scaling":[9],"large":[10],"language":[11],"models":[12],"efficiently.However,":[13],"how":[14],"to":[15,194],"design":[16,48,132,146],"desired":[18],"MoE":[19,99,126,145,210],"architecture":[20],"given":[21],"performance,":[22],"efficiency,":[23,37,108],"or":[24,67],"safety":[25,182],"goals":[26],"remains":[27],"absent.Existing":[28],"benchmarks":[29],"often":[30,83],"focus":[31],"on":[32,46,134],"isolated":[33],"aspects":[34],"(e.g.,":[35],"reasoning,":[36],"safety),":[38],"and":[39,54,62,74,109,120,154,169,173,181,208,213],"there":[40],"is":[41],"lack":[43],"of":[44,56,60,98,105,131,190,221],"consensus":[45],"optimal":[47,114,191],"choices,":[49],"such":[50],"the":[52,58,63,94,102,222],"number":[53],"size":[55],"experts,":[57],"type":[59],"routers,":[61],"regularization":[64,156],"during":[65,78,117],"pre-training,":[66],"strategies":[68,175,192],"like":[69],"freezing,":[70],"learning":[71],"rate":[72],"adjustments,":[73],"limiting":[75],"expert":[76],"collaboration":[77],"fine-tuning,":[79],"with":[80],"prior":[81],"works":[82],"yielding":[84],"conflicting":[85],"conclusions.Motivated":[86],"by":[87],"this":[88,198],"research":[89],"gap,":[90],"we":[91,148,184],"introduce":[92],"MoE-Bench,":[93],"first":[95],"comprehensive":[96],"assessment":[97],"designs":[100],"across":[101,128],"three":[103,186],"dimensions":[104,130],"reasoning":[106,158],"ability,":[107],"safety.Our":[110],"benchmark":[111],"systematically":[112],"evaluates":[113],"architectural":[115],"choices":[116,133],"both":[118],"pre-training":[119],"fine-tuning":[121],"phases.We":[122],"evaluate":[123],"two":[124],"popular":[125],"backbones":[127],"four":[129],"over":[135],"eight":[136],"metrics.Our":[137],"empirical":[138],"findings":[139],"uncover":[140],"hidden":[141],"underlying":[142],"correlations":[143],"among":[144],"choices.Specifically,":[147],"observe":[149],"that":[150],"(1)":[151],"tokenlevel":[152],"routing":[153,172],"z-loss":[155],"improve":[157],"performance;":[159],"(2)":[160],"shared":[161],"experts":[162],"enhance":[163],"training":[164],"stability":[165],"but":[166],"reduce":[167],"specialization;":[168],"(3)":[170],"collaboration-constrained":[171],"freezing":[174],"significantly":[176],"influence":[177],"load":[178],"balance,":[179],"specialization,":[180],"alignment.Furthermore,":[183],"propose":[185],"\"sweet":[187],"point\"":[188],"combinations":[189],"tailored":[193],"different":[195],"scenarios.We":[196],"hope":[197],"study":[199],"provides":[200],"actionable":[201],"insights":[202],"building":[204],"more":[205],"robust,":[206],"efficient,":[207],"secure":[209],"models.Code,":[211],"checkpoints,":[212],"raw":[214],"data":[215],"will":[216],"be":[217],"released":[218],"upon":[219],"acceptance":[220],"paper.":[223],"Mixtureof-ExpertsLLMs":[224]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-08T00:00:00"}
