{"id":"https://openalex.org/W4417403812","doi":"https://doi.org/10.1109/pact65351.2025.00015","title":"ScaleMoE: A Fast and Scalable Distributed Training Framework for Large-Scale Mixture-of-Experts Models","display_name":"ScaleMoE: A Fast and Scalable Distributed Training Framework for Large-Scale Mixture-of-Experts Models","publication_year":2025,"publication_date":"2025-11-03","ids":{"openalex":"https://openalex.org/W4417403812","doi":"https://doi.org/10.1109/pact65351.2025.00015"},"language":null,"primary_location":{"id":"doi:10.1109/pact65351.2025.00015","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00015","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Seohong Choi","orcid":null},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Seohong Choi","raw_affiliation_strings":["Sungkyunkwan University,Suwon,South Korea"],"affiliations":[{"raw_affiliation_string":"Sungkyunkwan University,Suwon,South Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081727510","display_name":"Hyun Pyo Hong","orcid":"https://orcid.org/0000-0003-1451-3141"},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Huize Hong","raw_affiliation_strings":["Sungkyunkwan University,Suwon,South Korea"],"affiliations":[{"raw_affiliation_string":"Sungkyunkwan University,Suwon,South Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084741187","display_name":"Tae Hee Han","orcid":"https://orcid.org/0000-0001-8508-7536"},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Tae Hee Han","raw_affiliation_strings":["Sungkyunkwan University,Suwon,South Korea"],"affiliations":[{"raw_affiliation_string":"Sungkyunkwan University,Suwon,South Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5109307149","display_name":"Joon\u2010Sung Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Joonsung Kim","raw_affiliation_strings":["Sungkyunkwan University,Suwon,South Korea"],"affiliations":[{"raw_affiliation_string":"Sungkyunkwan University,Suwon,South Korea","institution_ids":["https://openalex.org/I848706"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I848706"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.40241544,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"30","last_page":"42"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.335999995470047,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.335999995470047,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.13120000064373016,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.08129999786615372,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.862500011920929},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6277999877929688},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5968000292778015},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5902000069618225},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.349700003862381},{"id":"https://openalex.org/keywords/models-of-communication","display_name":"Models of communication","score":0.3391999900341034},{"id":"https://openalex.org/keywords/expert-system","display_name":"Expert system","score":0.3278000056743622}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.862500011920929},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8504999876022339},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6593000292778015},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6277999877929688},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5968000292778015},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5902000069618225},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.349700003862381},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34290000796318054},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.3391999900341034},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33899998664855957},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.3172000050544739},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C130120984","wikidata":"https://www.wikidata.org/wiki/Q2835898","display_name":"Distributed algorithm","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2718000113964081},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C158207573","wikidata":"https://www.wikidata.org/wiki/Q5747224","display_name":"Heterogeneous network","level":4,"score":0.25769999623298645},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25589999556541443},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2531000077724457},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/pact65351.2025.00015","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00015","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"},{"id":"https://openalex.org/F4320322064","display_name":"Korea Institute for Advancement of Technology","ror":"https://ror.org/015w1qa96"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1983144994","https://openalex.org/W2150884987","https://openalex.org/W2612387305","https://openalex.org/W2970231061","https://openalex.org/W3039165326","https://openalex.org/W3091097978","https://openalex.org/W3172942063","https://openalex.org/W3196974791","https://openalex.org/W3198659451","https://openalex.org/W3204434815","https://openalex.org/W4213072767","https://openalex.org/W4297097375","https://openalex.org/W4312968147","https://openalex.org/W4318541593","https://openalex.org/W4364382874","https://openalex.org/W4381480627","https://openalex.org/W4385567093","https://openalex.org/W4386260498","https://openalex.org/W4386396242","https://openalex.org/W4394564255","https://openalex.org/W4394923534","https://openalex.org/W4401211627","https://openalex.org/W4402671950","https://openalex.org/W4403348324","https://openalex.org/W4404782031","https://openalex.org/W4405429653","https://openalex.org/W4415796130","https://openalex.org/W4415796439","https://openalex.org/W4416035996"],"related_works":[],"abstract_inverted_index":{"The":[0],"size":[1],"of":[2,100,203],"pre-trained":[3],"models":[4,34,54],"has":[5,21],"continuously":[6],"increased":[7],"to":[8,29,56,108,113,131,153,167,189,205,209],"support":[9],"growing":[10],"demands":[11],"for":[12,50,73],"solving":[13],"more":[14],"complex":[15],"problems.":[16,111],"Especially,":[17],"mixture-of-experts":[18],"(MoE)":[19],"model":[20],"become":[22],"the":[23,41,133,210],"most":[24],"popular":[25],"approach,":[26],"enabling":[27],"systems":[28],"easily":[30],"train":[31],"extremely":[32],"large-scale":[33,52,74],"with":[35],"relatively":[36],"lower":[37],"computational":[38],"requirements.":[39],"However,":[40],"current":[42],"distributed":[43,70,84],"training":[44,71,85],"frameworks":[45],"cannot":[46],"achieve":[47],"scalable":[48,69,181],"performance":[49],"these":[51,110],"MoE":[53,75],"due":[55],"substantial":[57],"communication":[58,89,115,121,156,185],"overheads.":[59],"In":[60,193],"this":[61],"paper,":[62],"we":[63,117,139,158],"propose":[64,104,118,140,159],"ScaleMoE,":[65],"a":[66,148,201],"fast":[67],"and":[68,97],"framework":[72],"models.":[76],"We":[77,103],"first":[78],"identify":[79],"three":[80,105],"problems":[81],"in":[82,94,136],"state-of-the-art":[83,211],"frameworks:":[86],"high":[87],"all-to-all":[88,120,184],"overheads,":[90,157],"severe":[91],"load":[92,134],"imbalance":[93,135],"expert":[95,137,142,161],"selection,":[96,138],"insufficient":[98],"consideration":[99],"heterogeneous":[101,172],"networks.":[102],"novel":[106,149],"optimizations":[107],"resolve":[109],"First,":[112],"reduce":[114],"volumes,":[116],"adaptive":[119],"that":[122,144,163,178],"eliminates":[123],"unnecessary":[124],"zeros":[125],"caused":[126],"by":[127,187],"zero":[128],"padding.":[129],"Second,":[130],"address":[132],"dynamic":[141],"clustering":[143,150],"rebalances":[145],"experts":[146,166],"using":[147],"methodology.":[151],"Lastly,":[152],"further":[154],"minimize":[155],"topology-aware":[160],"remapping":[162],"carefully":[164],"maps":[165],"GPU":[168],"devices":[169],"while":[170],"considering":[171],"network":[173],"bandwidths.":[174],"Our":[175],"evaluations":[176],"show":[177],"ScaleMoE":[179,195],"achieves":[180],"performance,":[182,199],"reducing":[183],"overheads":[186],"up":[188,204],"$\\mathbf{8":[190],"1":[191],"\\%}$.":[192],"general,":[194],"significantly":[196],"improves":[197],"system":[198],"achieving":[200],"speedup":[202],"$3.3":[206],"\\times$":[207],"compared":[208],"framework.":[212]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-12-16T00:00:00"}
