{"id":"https://openalex.org/W4416203812","doi":"https://doi.org/10.1145/3712285.3759886","title":"X-MoE: Enabling Scalable Training for Emerging Mixture-of-Experts Architectures on HPC Platforms","display_name":"X-MoE: Enabling Scalable Training for Emerging Mixture-of-Experts Architectures on HPC Platforms","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W4416203812","doi":"https://doi.org/10.1145/3712285.3759886"},"language":null,"primary_location":{"id":"doi:10.1145/3712285.3759886","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759886","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3712285.3759886","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030158048","display_name":"Yueming Yuan","orcid":"https://orcid.org/0009-0005-7443-6098"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yueming Yuan","raw_affiliation_strings":["University of Illinois Urbana-Champaign, Urbana, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign, Urbana, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111000900","display_name":"Ahan Gupta","orcid":"https://orcid.org/0000-0003-2664-8545"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ahan Gupta","raw_affiliation_strings":["University of Illinois Urbana-Champaign, Urbana, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign, Urbana, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103724933","display_name":"Jianping Li","orcid":"https://orcid.org/0009-0008-1408-8564"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jianping Li","raw_affiliation_strings":["University of Illinois Urbana-Champaign, Urbana, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign, Urbana, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040970890","display_name":"Sajal Dash","orcid":"https://orcid.org/0000-0001-5308-914X"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sajal Dash","raw_affiliation_strings":["Oak Ridge National Laboratory (ORNL), Oak Ridge, USA"],"affiliations":[{"raw_affiliation_string":"Oak Ridge National Laboratory (ORNL), Oak Ridge, USA","institution_ids":["https://openalex.org/I1289243028"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101916963","display_name":"Feiyi Wang","orcid":"https://orcid.org/0000-0002-0099-1559"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Feiyi Wang","raw_affiliation_strings":["Oak Ridge National Laboratory (ORNL), Oak Ridge, USA"],"affiliations":[{"raw_affiliation_string":"Oak Ridge National Laboratory (ORNL), Oak Ridge, USA","institution_ids":["https://openalex.org/I1289243028"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077768924","display_name":"Minjia Zhang","orcid":"https://orcid.org/0000-0002-8165-166X"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Minjia Zhang","raw_affiliation_strings":["University of Illinois Urbana-Champaign, Urbana, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign, Urbana, USA","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5030158048"],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":1.2784,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.85765894,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1315","last_page":"1331"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6510000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6510000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.052000001072883606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.025299999862909317,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.8080999851226807},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.7430999875068665},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.590499997138977},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3718999922275543},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.32829999923706055},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.3237000107765198},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.31040000915527344},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.31029999256134033}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.8080999851226807},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7440999746322632},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.7430999875068665},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.590499997138977},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.5080999732017517},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3718999922275543},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3659999966621399},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.3237000107765198},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3160000145435333},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.31029999256134033},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2969000041484833},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2897000014781952},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2842999994754791},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C20136886","wikidata":"https://www.wikidata.org/wiki/Q749647","display_name":"Interoperability","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C2994168587","wikidata":"https://www.wikidata.org/wiki/Q5295","display_name":"Random access memory","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3712285.3759886","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759886","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3712285.3759886","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759886","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W2933138175","https://openalex.org/W2954698171","https://openalex.org/W3081168214","https://openalex.org/W3147460469","https://openalex.org/W3204998121","https://openalex.org/W4220967350","https://openalex.org/W4376652719","https://openalex.org/W4387321091","https://openalex.org/W4388662129","https://openalex.org/W4395117348","https://openalex.org/W4402671950"],"related_works":[],"abstract_inverted_index":{"Emerging":[0],"expert-specialized":[1],"Mixture-of-Experts":[2],"(MoE)":[3],"architectures,":[4],"such":[5],"as":[6],"DeepSeek-MoE,":[7],"deliver":[8,69],"strong":[9],"model":[10,133],"quality":[11],"through":[12],"fine-grained":[13],"expert":[14],"segmentation":[15],"and":[16,30,94],"large":[17],"top-k":[18],"routing.":[19],"However,":[20],"their":[21],"scalability":[22],"is":[23],"limited":[24],"by":[25,108],"substantial":[26],"activation":[27],"memory":[28],"overhead":[29],"costly":[31],"all-to-all":[32],"communication.":[33],"Furthermore,":[34],"current":[35],"MoE":[36,64,75,87,99],"training":[37,65,71,88,145],"systems":[38],"\u2013":[39,45,126],"primarily":[40],"optimized":[41],"for":[42,73],"NVIDIA":[43],"GPUs":[44,125],"perform":[46],"suboptimally":[47],"on":[48,103],"non-NVIDIA":[49],"platforms,":[50],"leaving":[51],"significant":[52],"computational":[53],"potential":[54],"untapped.":[55],"In":[56],"this":[57,79],"work,":[58],"we":[59],"present":[60],"X-MoE,":[61],"a":[62],"novel":[63,82],"system":[66],"designed":[67],"to":[68,119],"scalable":[70],"performance":[72],"next-generation":[74],"architectures.":[76],"X-MoE":[77,114],"achieves":[78],"via":[80],"several":[81],"techniques,":[83],"including":[84],"efficient":[85],"padding-free":[86],"with":[89,97,134],"cross-platform":[90],"kernels,":[91],"redundancy-bypassing":[92],"dispatch,":[93],"hybrid":[95],"parallelism":[96],"sequence-sharded":[98],"blocks.":[100],"Our":[101],"evaluation":[102],"the":[104,130,138],"Frontier":[105],"supercomputer,":[106],"powered":[107],"AMD":[109],"MI250X":[110],"GPUs,":[111],"shows":[112],"that":[113],"scales":[115],"DeepSeek-style":[116],"MoEs":[117],"up":[118],"545":[120],"billion":[121],"parameters":[122],"across":[123],"1024":[124],"10x":[127],"larger":[128],"than":[129],"largest":[131],"trainable":[132],"existing":[135],"methods":[136],"under":[137],"same":[139],"hardware":[140],"budget,":[141],"while":[142],"maintaining":[143],"high":[144],"throughput.":[146]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-12T00:00:00"}
