{"id":"https://openalex.org/W4412945078","doi":"https://doi.org/10.18653/v1/2025.acl-long.816","title":"Automatic Expert Discovery in LLM Upcycling via Sparse Interpolated Mixture-of-Experts","display_name":"Automatic Expert Discovery in LLM Upcycling via Sparse Interpolated Mixture-of-Experts","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412945078","doi":"https://doi.org/10.18653/v1/2025.acl-long.816"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-long.816","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.816","pdf_url":"https://aclanthology.org/2025.acl-long.816.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.acl-long.816.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058266998","display_name":"Shengzhuang Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I4210111299","display_name":"Thomson Reuters Foundation","ror":"https://ror.org/01xpmbc27","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210111299"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shengzhuang Chen","raw_affiliation_strings":["Foundational Research Zhejiang University Thomson Reuters Foundational Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Foundational Research Zhejiang University Thomson Reuters Foundational Research","institution_ids":["https://openalex.org/I4210111299"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100619276","display_name":"Ying Wei","orcid":"https://orcid.org/0000-0002-4247-1770"},"institutions":[{"id":"https://openalex.org/I4210111299","display_name":"Thomson Reuters Foundation","ror":"https://ror.org/01xpmbc27","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210111299"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ying Wei","raw_affiliation_strings":["Foundational Research Zhejiang University Thomson Reuters Foundational Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Foundational Research Zhejiang University Thomson Reuters Foundational Research","institution_ids":["https://openalex.org/I4210111299"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012775658","display_name":"Jonathan Richard Schwarz","orcid":null},"institutions":[{"id":"https://openalex.org/I4210111299","display_name":"Thomson Reuters Foundation","ror":"https://ror.org/01xpmbc27","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210111299"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonathan Richard Schwarz","raw_affiliation_strings":["Foundational Research Zhejiang University Thomson Reuters Foundational Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Foundational Research Zhejiang University Thomson Reuters Foundational Research","institution_ids":["https://openalex.org/I4210111299"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18176692,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"16703","last_page":"16717"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.96670001745224,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.96670001745224,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13274","display_name":"Expert finding and Q&A systems","score":0.9602000117301941,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9417999982833862,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6495803594589233},{"id":"https://openalex.org/keywords/drug-discovery","display_name":"Drug discovery","score":0.5197910070419312},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39710018038749695},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.32173359394073486},{"id":"https://openalex.org/keywords/bioinformatics","display_name":"Bioinformatics","score":0.11468160152435303}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6495803594589233},{"id":"https://openalex.org/C74187038","wikidata":"https://www.wikidata.org/wiki/Q1418791","display_name":"Drug discovery","level":2,"score":0.5197910070419312},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39710018038749695},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.32173359394073486},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.11468160152435303},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.acl-long.816","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.816","pdf_url":"https://aclanthology.org/2025.acl-long.816.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.acl-long.816","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.816","pdf_url":"https://aclanthology.org/2025.acl-long.816.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412945078.pdf","grobid_xml":"https://content.openalex.org/works/W4412945078.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"We":[0],"present":[1],"Sparse":[2],"Interpolated":[3],"Mixture-of-Experts":[4],"(SIMoE)":[5],"instruction-tuning,":[6,31],"an":[7,66,98],"endto-end":[8],"algorithm":[9],"designed":[10],"to":[11,58,103],"fine-tune":[12],"a":[13,21,39,47,72],"dense":[14],"pre-trained":[15],"Large":[16],"Language":[17],"Model":[18],"(LLM)":[19],"into":[20],"MoE-style":[22],"model":[23],"that":[24,56,83],"possesses":[25],"capabilities":[26],"in":[27],"multiple":[28,35],"specialized":[29,36],"domains.During":[30],"SIMoE":[32,87],"automatically":[33],"identifies":[34],"experts":[37],"under":[38],"specified":[40],"sparsity":[41],"constraint,":[42],"with":[43],"each":[44],"expert":[45,68],"representing":[46],"structurally":[48],"sparse":[49],"subset":[50],"of":[51],"the":[52,62],"seed":[53],"LLM's":[54],"parameters":[55],"correspond":[57],"domainspecific":[59],"knowledge":[60,78],"within":[61],"data.SIMoE":[63],"simultaneously":[64],"learns":[65],"input-dependent":[67],"merging":[69],"strategy":[70],"via":[71],"router":[73],"network,":[74],"leveraging":[75],"rich":[76],"cross-expert":[77],"for":[79],"superior":[80],"downstream":[81],"generalization":[82],"surpasses":[84],"existing":[85],"baselines.Empirically,":[86],"consistently":[88],"achieves":[89],"state-of-the-art":[90],"performance":[91],"on":[92],"common":[93],"instruction-tuning":[94],"benchmarks":[95],"while":[96],"maintaining":[97],"optimal":[99],"performance-compute":[100],"trade-off":[101],"compared":[102],"all":[104],"baselines.":[105]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
