{"id":"https://openalex.org/W7139927015","doi":"https://doi.org/10.1016/j.procs.2026.01.105","title":"Sparse Mixture-of-Experts Transformers for Efficient Scaling of Large Language Models","display_name":"Sparse Mixture-of-Experts Transformers for Efficient Scaling of Large Language Models","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7139927015","doi":"https://doi.org/10.1016/j.procs.2026.01.105"},"language":"en","primary_location":{"id":"doi:10.1016/j.procs.2026.01.105","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.105","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1016/j.procs.2026.01.105","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037528486","display_name":"Taher M. Ghazal","orcid":"https://orcid.org/0000-0003-0672-7924"},"institutions":[{"id":"https://openalex.org/I885383172","display_name":"National University of Malaysia","ror":"https://ror.org/00bw8d226","country_code":"MY","type":"education","lineage":["https://openalex.org/I885383172"]}],"countries":["MY"],"is_corresponding":true,"raw_author_name":"Taher M. Ghazal","raw_affiliation_strings":["Faculty of Computing and IT, Sohar University, Oman, Department of Networks and Cybersecurity, Hourani Center for Applied Scientific Research, Al-Ahliyya Amman University, Amman, Jordan. Center for Cyber Security, Faculty of Information Science and Technology, Universiti Kebangsaan Malaysia (UKM), 43600 Bangi, Selangor, Malaysia"],"affiliations":[{"raw_affiliation_string":"Faculty of Computing and IT, Sohar University, Oman, Department of Networks and Cybersecurity, Hourani Center for Applied Scientific Research, Al-Ahliyya Amman University, Amman, Jordan. Center for Cyber Security, Faculty of Information Science and Technology, Universiti Kebangsaan Malaysia (UKM), 43600 Bangi, Selangor, Malaysia","institution_ids":["https://openalex.org/I885383172"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5037528486"],"corresponding_institution_ids":["https://openalex.org/I885383172"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93159769,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"275","issue":null,"first_page":"923","last_page":"930"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.17299999296665192,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.17299999296665192,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.16619999706745148,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1136000007390976,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6796000003814697},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5938000082969666},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4537000060081482},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.2770000100135803}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9174000024795532},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6796000003814697},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5938000082969666},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4537000060081482},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.40790000557899475},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.38760000467300415},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37869998812675476},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3181999921798706},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.2770000100135803},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.24699999392032623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1016/j.procs.2026.01.105","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.105","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1016/j.procs.2026.01.105","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.105","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.4674840271472931}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W3195809871","https://openalex.org/W3210350108","https://openalex.org/W4200189282","https://openalex.org/W4297509986","https://openalex.org/W4366198760","https://openalex.org/W4391610999","https://openalex.org/W4391966553","https://openalex.org/W4392284622","https://openalex.org/W4392704508","https://openalex.org/W4394753105","https://openalex.org/W4395087141","https://openalex.org/W4399109643","https://openalex.org/W4399440433","https://openalex.org/W4399811679","https://openalex.org/W4400526653","https://openalex.org/W4401864268","https://openalex.org/W4402100932","https://openalex.org/W4403299787","https://openalex.org/W4403987255","https://openalex.org/W4404651128","https://openalex.org/W4406810573","https://openalex.org/W4406857280","https://openalex.org/W4406911809","https://openalex.org/W4407065044","https://openalex.org/W4407667759","https://openalex.org/W4408128024","https://openalex.org/W4408252407","https://openalex.org/W4408725402","https://openalex.org/W4408729471","https://openalex.org/W4408781028","https://openalex.org/W4408788024","https://openalex.org/W4408858386","https://openalex.org/W4410006666","https://openalex.org/W4410099343","https://openalex.org/W4410529690","https://openalex.org/W4410616178","https://openalex.org/W4413757113"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"are":[4,23,73],"the":[5],"best":[6],"at":[7],"processing":[8],"natural":[9],"language.":[10],"Due":[11],"to":[12,58,85,111,154],"their":[13],"growing":[14],"size":[15],"and":[16,21,38,54,61,120,131,140,143,156,165,169],"computing":[17],"needs,":[18],"scalability,":[19],"efficiency,":[20],"deployment":[22],"difficult.":[24],"Traditional":[25],"dense":[26,136],"transformer":[27],"designs":[28],"activate":[29],"all":[30],"parameters":[31],"during":[32],"inference,":[33],"increasing":[34,118],"power":[35,100],"consumption,":[36],"delay,":[37],"cost.":[39],"The":[40,107],"biggest":[41],"challenge":[42],"is":[43,149],"scaling":[44],"LLMs":[45],"without":[46,101],"compromising":[47,102],"performance,":[48],"interpretability,":[49],"or":[50,116],"generalizability.":[51],"Current":[52],"compression":[53],"parameter-sharing":[55],"methods":[56],"fail":[57],"balance":[59],"efficiency":[60],"accuracy":[62,139],"in":[63,105,113,138,158],"large-scale":[64,78],"deployments,":[65],"despite":[66],"slight":[67],"advantages.":[68],"Sparse":[69,88],"Mixture-of-Experts":[70],"Transformers":[71],"(SMOE-T)":[72],"a":[74],"novel":[75],"paradigm":[76],"for":[77,95,126],"language":[79,127],"modeling":[80],"that":[81],"uses":[82],"conditional":[83],"computation":[84],"improve":[86],"efficiency.":[87],"gating":[89],"activates":[90],"only":[91],"select":[92],"expert":[93],"modules":[94],"each":[96],"input,":[97],"saving":[98],"computational":[99],"model":[103],"expressiveness":[104],"SMoE-T.":[106],"approach":[108],"allows":[109,163],"experts":[110],"specialize":[112],"specific":[114],"languages":[115],"subjects,":[117],"productivity":[119],"flexibility.":[121],"On":[122],"common":[123],"NLP":[124],"datasets":[125],"modeling,":[128],"machine":[129],"translation,":[130],"question":[132],"answering,":[133],"SMoE-T":[134,148,162],"outperforms":[135],"transformers":[137],"reduces":[141],"FLOPs":[142],"inference":[144],"latency":[145],"by":[146],"60%.":[147],"scalable,":[150],"making":[151],"it":[152],"easy":[153],"train":[155],"deploy":[157],"distributed":[159],"systems.":[160],"Finally,":[161],"resource-efficient":[164],"scalable":[166],"LLM":[167],"training":[168],"deployment,":[170],"enabling":[171],"long-term,":[172],"widely":[173],"available":[174],"generative":[175],"AI":[176],"solutions.":[177]},"counts_by_year":[],"updated_date":"2026-03-22T06:25:25.174409","created_date":"2026-03-21T00:00:00"}
