{"id":"https://openalex.org/W4407213051","doi":"https://doi.org/10.1109/tpds.2025.3539297","title":"EfficientMoE: Optimizing Mixture-of-Experts Model Training With Adaptive Load Balance","display_name":"EfficientMoE: Optimizing Mixture-of-Experts Model Training With Adaptive Load Balance","publication_year":2025,"publication_date":"2025-02-06","ids":{"openalex":"https://openalex.org/W4407213051","doi":"https://doi.org/10.1109/tpds.2025.3539297"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2025.3539297","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3539297","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008416156","display_name":"Yan Zeng","orcid":"https://orcid.org/0000-0003-2026-417X"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yan Zeng","raw_affiliation_strings":["Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101323440","display_name":"Chengchuang Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengchuang Huang","raw_affiliation_strings":["Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438301","display_name":"Mei Yang","orcid":"https://orcid.org/0000-0002-4233-7889"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yipeng Mei","raw_affiliation_strings":["Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115603648","display_name":"Lifu Zhang","orcid":"https://orcid.org/0000-0002-3533-9966"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lifu Zhang","raw_affiliation_strings":["Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101812366","display_name":"Teng Su","orcid":"https://orcid.org/0009-0005-9517-2845"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Teng Su","raw_affiliation_strings":["Huawei Technologies Co Ltd, Distributed Computing Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co Ltd, Distributed Computing Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008017336","display_name":"Wei Ye","orcid":"https://orcid.org/0000-0003-4905-2015"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Ye","raw_affiliation_strings":["Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101631496","display_name":"Wenqi Shi","orcid":"https://orcid.org/0000-0002-9528-0257"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenqi Shi","raw_affiliation_strings":["Huawei Technologies Co Ltd, Distributed Computing Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co Ltd, Distributed Computing Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100343226","display_name":"Shengnan Wang","orcid":"https://orcid.org/0000-0003-2995-9832"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengnan Wang","raw_affiliation_strings":["Huawei Technologies Co Ltd, Distributed Computing Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co Ltd, Distributed Computing Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5008416156"],"corresponding_institution_ids":["https://openalex.org/I50760025"],"apc_list":null,"apc_paid":null,"fwci":9.9395,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.97367317,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"36","issue":"4","first_page":"677","last_page":"688"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12814","display_name":"Gaussian Processes and Bayesian Inference","score":0.9782999753952026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12814","display_name":"Gaussian Processes and Bayesian Inference","score":0.9782999753952026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10711","display_name":"Target Tracking and Data Fusion in Sensor Networks","score":0.9771999716758728,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12879","display_name":"Distributed Sensor Networks and Detection Algorithms","score":0.9690999984741211,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7929647564888},{"id":"https://openalex.org/keywords/balance","display_name":"Balance (ability)","score":0.6278092861175537},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5966509580612183},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3332313001155853},{"id":"https://openalex.org/keywords/physical-medicine-and-rehabilitation","display_name":"Physical medicine and rehabilitation","score":0.10272899270057678}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7929647564888},{"id":"https://openalex.org/C168031717","wikidata":"https://www.wikidata.org/wiki/Q1530280","display_name":"Balance (ability)","level":2,"score":0.6278092861175537},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5966509580612183},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3332313001155853},{"id":"https://openalex.org/C99508421","wikidata":"https://www.wikidata.org/wiki/Q2678675","display_name":"Physical medicine and rehabilitation","level":1,"score":0.10272899270057678},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2025.3539297","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3539297","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6700000166893005,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1632114991","https://openalex.org/W2150884987","https://openalex.org/W2169717084","https://openalex.org/W2963015836","https://openalex.org/W2969388332","https://openalex.org/W2969766737","https://openalex.org/W2991040477","https://openalex.org/W3019389960","https://openalex.org/W3129831491","https://openalex.org/W3157657667","https://openalex.org/W3213241618","https://openalex.org/W4220967350","https://openalex.org/W4385245566","https://openalex.org/W4386396242","https://openalex.org/W4386709668","https://openalex.org/W4400409890","https://openalex.org/W4402671950","https://openalex.org/W6732520560","https://openalex.org/W6748574626","https://openalex.org/W6755207826","https://openalex.org/W6756379755","https://openalex.org/W6763509872","https://openalex.org/W6766057927","https://openalex.org/W6767997687","https://openalex.org/W6778883912","https://openalex.org/W6780805062","https://openalex.org/W6784333009","https://openalex.org/W6788811087","https://openalex.org/W6791276965","https://openalex.org/W6793102544","https://openalex.org/W6811726652","https://openalex.org/W6839827798","https://openalex.org/W6846422107","https://openalex.org/W6854866820","https://openalex.org/W6860710830"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659"],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"efficiently":[2],"trains":[3],"large":[4],"models":[5],"by":[6,75,194],"using":[7],"sparse":[8,83],"activation":[9],"to":[10,63,81,101,128,144,175,209],"lower":[11],"costs,":[12],"selecting":[13],"a":[14,111,126,140,151],"few":[15],"experts":[16,79,156,183],"based":[17,114],"on":[18,55,115],"data":[19,119],"characteristics.":[20],"However,":[21],"it":[22],"faces":[23],"challenges":[24,51],"such":[25],"as":[26],"All-to-All":[27,60,162],"communication":[28,61,163,236],"overhead":[29,192],"and":[30,68,89,93,118,164,190,203,218,238,250],"load":[31,69,117,141],"imbalance,":[32],"with":[33,157,214,247],"most":[34],"optimizations":[35],"targeting":[36],"dynamic":[37,152],"graphs":[38],"rather":[39],"than":[40],"the":[41,82,86,90,251],"more":[42],"efficient":[43],"static":[44,56,98,185,255],"graphs.":[45,256],"This":[46,197],"study":[47,198],"identifies":[48],"two":[49,76],"key":[50],"in":[52,201,228,235],"training":[53,230],"MoE":[54,87,212],"graphs:":[57],"1)":[58],"excessive":[59],"(up":[62],"75%":[64],"of":[65,72,85,107,181,226],"iteration":[66],"time)":[67],"imbalance":[70],"(70%":[71],"tokens":[73],"handled":[74],"experts)":[77],"between":[78],"due":[80],"structure":[84],"model":[88,143,172,213,229],"token":[91,133],"distribution;":[92],"2)":[94],"inefficient":[95],"zero-padding":[96],"for":[97,155,179,254],"shapes,":[99],"leading":[100],"unnecessary":[102],"computational":[103,241],"overhead(wasting":[104],"approximately":[105,232],"50%":[106],"resources).":[108],"Thus,":[109],"EfficientMoE,":[110],"scheduling":[112],"method":[113,253],"expert":[116,135,146,159,170],"characteristics,":[120],"is":[121,173],"introduced.":[122],"EfficientMoE":[123,149,200,222],"first":[124],"designs":[125],"sampler":[127],"collect":[129],"real-time":[130],"information":[131],"about":[132],"distribution,":[134],"load,":[136,160],"etc.":[137],"It":[138],"constructs":[139],"prediction":[142],"evaluate":[145,219],"load.":[147],"Subsequently,":[148],"proposes":[150],"schedule":[153],"strategy":[154],"evaluated":[158],"reducing":[161],"addressing":[165],"load-balancing":[166],"issues.":[167],"Additionally,":[168],"an":[169,211,224],"capacity":[171],"proposed":[174],"set":[176],"different":[177,244],"capacities":[178],"replicas":[180],"hot":[182],"before":[184],"graph":[186],"compilation,":[187],"minimizing":[188],"computation":[189],"storage":[191],"caused":[193],"significant":[195],"padding.":[196],"implements":[199],"MindSpore":[202],"uses":[204],"32":[205],"Ascend":[206],"AI":[207],"accelerators":[208],"train":[210],"21":[215],"billion":[216],"parameters":[217],"its":[220],"validity.":[221],"demonstrated":[223],"improvement":[225],"30%":[227],"time,":[231,237],"12%":[233],"reduction":[234],"saved":[239],"35%":[240],"resources":[242],"across":[243],"clusters,":[245],"compared":[246],"Switch":[248],"transformers,":[249],"Fastermoe":[252]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
