{"id":"https://openalex.org/W4416036393","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.807","title":"SliceMoE: Routing Embedding Slices Instead of Tokens for Fine-Grained and Balanced Transformer Scaling","display_name":"SliceMoE: Routing Embedding Slices Instead of Tokens for Fine-Grained and Balanced Transformer Scaling","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036393","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.807"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.807","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.807","pdf_url":"https://aclanthology.org/2025.emnlp-main.807.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.807.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119914039","display_name":"Harshil Vejendla","orcid":null},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]},{"id":"https://openalex.org/I4210096112","display_name":"Rutgers Sexual and Reproductive Health and Rights","ror":"https://ror.org/00rcvgx40","country_code":"NL","type":"other","lineage":["https://openalex.org/I4210096112"]},{"id":"https://openalex.org/I4210123151","display_name":"R\u00fctgers (Germany)","ror":"https://ror.org/02wmkbh90","country_code":"DE","type":"company","lineage":["https://openalex.org/I4210123151"]}],"countries":["DE","NL","US"],"is_corresponding":true,"raw_author_name":"Harshil Vejendla","raw_affiliation_strings":["Rutgers University -New Brunswick"],"affiliations":[{"raw_affiliation_string":"Rutgers University -New Brunswick","institution_ids":["https://openalex.org/I4210123151","https://openalex.org/I102322142","https://openalex.org/I4210096112"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5119914039"],"corresponding_institution_ids":["https://openalex.org/I102322142","https://openalex.org/I4210096112","https://openalex.org/I4210123151"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35142607,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"15982","last_page":"15989"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11527","display_name":"3D IC and TSV technologies","score":0.2410999983549118,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11527","display_name":"3D IC and TSV technologies","score":0.2410999983549118,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12224","display_name":"Nanofabrication and Lithography Techniques","score":0.20839999616146088,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11301","display_name":"Advanced Surface Polishing Techniques","score":0.02800000086426735,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5914000272750854},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5073000192642212},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4790000021457672},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.3203999996185303}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6934000253677368},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5914000272750854},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5073000192642212},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4790000021457672},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4569000005722046},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.3203999996185303},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.30250000953674316},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.28119999170303345},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2671999931335449},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2522999942302704}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.807","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.807","pdf_url":"https://aclanthology.org/2025.emnlp-main.807.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.807","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.807","pdf_url":"https://aclanthology.org/2025.emnlp-main.807.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036393.pdf","grobid_xml":"https://content.openalex.org/works/W4416036393.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"layers":[2],"scale":[3],"transformers":[4],"by":[5],"routing":[6],"tokens":[7,82],"to":[8,22,118],"a":[9,42,57,92],"sparse":[10],"subset":[11],"of":[12,41],"feed-forward":[13],"experts.Token-level":[14],"routing,":[15],"however,":[16],"assigns":[17],"an":[18,35,85],"entire":[19],"semantic":[20,141],"spectrum":[21],"each":[23,55],"expert,":[24,86],"creating":[25],"capacity":[26,94],"bottlenecks,":[27],"load-balancing":[28],"pathologies,":[29],"and":[30,53,71,98,110,131],"limited":[31],"specialisation.We":[32],"introduce":[33],"SliceMoE,":[34],"architecture":[36],"that":[37],"routes":[38],"contiguous":[39],"slices":[40,69,79],"token's":[43],"hidden":[44],"vector.A":[45],"d-dimensional":[46],"embedding":[47],"is":[48,88],"partitioned":[49],"into":[50],"S":[51],"slices,":[52],"for":[54],"slice,":[56],"lightweight":[58],"shared":[59],"router":[60],"predicts":[61],"the":[62],"top-k":[63],"experts.Experts":[64],"operate":[65],"on":[66,103],"their":[67],"assigned":[68],"independently,":[70],"outputs":[72],"are":[73],"re-assembled,":[74],"maintaining":[75],"per-token":[76],"FLOP":[77],"efficiency.Because":[78],"from":[80],"different":[81],"interleave":[83],"within":[84],"utilisation":[87],"naturally":[89],"smoother.We":[90],"propose":[91],"slice-level":[93],"loss,":[95],"cross-slice":[96],"dropout,":[97],"efficient":[99],"fused":[100],"batched-GEMM":[101],"kernels.Experiments":[102],"WikiText-103":[104],"language":[105],"modelling,":[106],"WMT":[107],"En-De":[108],"translation,":[109],"three":[111],"text-classification":[112],"datasets":[113],"show":[114],"SliceMoE":[115],"attains":[116],"up":[117],"1.7":[119],"faster":[120],"inference":[121],"than":[122,128],"dense":[123],"baselines,":[124],"12-18%":[125],"lower":[126],"perplexity":[127],"parameter-matched":[129],"token-MoE,":[130],"improved":[132],"expert":[133],"balance,":[134],"with":[135],"interpretable":[136],"expertise":[137],"over":[138],"syntactic":[139],"versus":[140],"sub-spaces.":[142]},"counts_by_year":[],"updated_date":"2026-03-09T07:00:12.390032","created_date":"2025-11-08T00:00:00"}
