{"id":"https://openalex.org/W7140089859","doi":"https://doi.org/10.1109/ton.2026.3676382","title":"RailS: Load Balancing for All-to-All Communication in Distributed Mixture-of-Experts Training","display_name":"RailS: Load Balancing for All-to-All Communication in Distributed Mixture-of-Experts Training","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7140089859","doi":"https://doi.org/10.1109/ton.2026.3676382"},"language":null,"primary_location":{"id":"doi:10.1109/ton.2026.3676382","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ton.2026.3676382","pdf_url":null,"source":{"id":"https://openalex.org/S5407042750","display_name":"IEEE Transactions on Networking","issn_l":"2998-4157","issn":["2998-4157"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Networking","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130372911","display_name":"Heng Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I4210152380","display_name":"Shenzhen Technology University","ror":"https://ror.org/04qzpec27","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210152380"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Heng Xu","raw_affiliation_strings":["Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0001-1599-8200","affiliations":[{"raw_affiliation_string":"Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China","institution_ids":["https://openalex.org/I4210152380","https://openalex.org/I3131625388"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhiwei Yu","orcid":"https://orcid.org/0009-0009-7776-4985"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiwei Yu","raw_affiliation_strings":["Institute for Network Sciences and Cyberspace, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-7776-4985","affiliations":[{"raw_affiliation_string":"Institute for Network Sciences and Cyberspace, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126735484","display_name":"Chengze Du","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I4210152380","display_name":"Shenzhen Technology University","ror":"https://ror.org/04qzpec27","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210152380"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengze Du","raw_affiliation_strings":["Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0005-5313-7750","affiliations":[{"raw_affiliation_string":"Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China","institution_ids":["https://openalex.org/I4210152380","https://openalex.org/I3131625388"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130379354","display_name":"Ying Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I4210152380","display_name":"Shenzhen Technology University","ror":"https://ror.org/04qzpec27","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210152380"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Zhou","raw_affiliation_strings":["Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0001-9803-262X","affiliations":[{"raw_affiliation_string":"Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China","institution_ids":["https://openalex.org/I4210152380","https://openalex.org/I3131625388"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103214651","display_name":"Letian Li","orcid":"https://orcid.org/0000-0001-6856-1785"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Letian Li","raw_affiliation_strings":["Department of Information Engineering, The Chinese University of Hong Kong, Shatin, Hong Kong"],"raw_orcid":"https://orcid.org/0000-0001-6856-1785","affiliations":[{"raw_affiliation_string":"Department of Information Engineering, The Chinese University of Hong Kong, Shatin, Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124081168","display_name":"Haojie Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I180662265","display_name":"China Mobile (China)","ror":"https://ror.org/05gftfe97","country_code":"CN","type":"company","lineage":["https://openalex.org/I180662265"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haojie Wang","raw_affiliation_strings":["China Mobile, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-2721-7475","affiliations":[{"raw_affiliation_string":"China Mobile, Beijing, China","institution_ids":["https://openalex.org/I180662265"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101198011","display_name":"Weiqiang Cheng","orcid":null},"institutions":[{"id":"https://openalex.org/I180662265","display_name":"China Mobile (China)","ror":"https://ror.org/05gftfe97","country_code":"CN","type":"company","lineage":["https://openalex.org/I180662265"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiqiang Cheng","raw_affiliation_strings":["China Mobile, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China Mobile, Beijing, China","institution_ids":["https://openalex.org/I180662265"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5130358739","display_name":"Jialong Li","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I4210152380","display_name":"Shenzhen Technology University","ror":"https://ror.org/04qzpec27","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210152380"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jialong Li","raw_affiliation_strings":["Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0003-3416-5551","affiliations":[{"raw_affiliation_string":"Faculty of Computer Science and Artificial Intelligence, Shenzhen University of Advanced Technology, Shenzhen, China","institution_ids":["https://openalex.org/I4210152380","https://openalex.org/I3131625388"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.39693737,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"34","issue":null,"first_page":"4431","last_page":"4448"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.06639999896287918,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.06639999896287918,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.05999999865889549,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.057999998331069946,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.54339998960495},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5333999991416931},{"id":"https://openalex.org/keywords/load-management","display_name":"Load management","score":0.3450999855995178},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.2538999915122986},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.24250000715255737}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6561999917030334},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.54339998960495},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5333999991416931},{"id":"https://openalex.org/C2779370713","wikidata":"https://www.wikidata.org/wiki/Q357554","display_name":"Load management","level":2,"score":0.3450999855995178},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.30790001153945923},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.30070000886917114},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.24250000715255737},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.24160000681877136},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.23890000581741333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ton.2026.3676382","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ton.2026.3676382","pdf_url":null,"source":{"id":"https://openalex.org/S5407042750","display_name":"IEEE Transactions on Networking","issn_l":"2998-4157","issn":["2998-4157"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Networking","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":46,"referenced_works":["https://openalex.org/W1978175770","https://openalex.org/W1998471240","https://openalex.org/W2011282943","https://openalex.org/W2062362478","https://openalex.org/W2087178199","https://openalex.org/W2099657323","https://openalex.org/W2103110737","https://openalex.org/W2104680817","https://openalex.org/W2117702591","https://openalex.org/W2126210439","https://openalex.org/W2129386480","https://openalex.org/W2130531694","https://openalex.org/W2132320636","https://openalex.org/W2136850962","https://openalex.org/W2149804187","https://openalex.org/W2157614013","https://openalex.org/W2157990152","https://openalex.org/W2167025919","https://openalex.org/W2169246522","https://openalex.org/W2194775991","https://openalex.org/W2298436731","https://openalex.org/W2465793152","https://openalex.org/W2523697921","https://openalex.org/W2540750097","https://openalex.org/W2744387122","https://openalex.org/W2744698795","https://openalex.org/W2770706713","https://openalex.org/W2809353470","https://openalex.org/W2896457183","https://openalex.org/W2993199282","https://openalex.org/W3161886301","https://openalex.org/W4232284301","https://openalex.org/W4249923717","https://openalex.org/W4256524787","https://openalex.org/W4290991080","https://openalex.org/W4290991419","https://openalex.org/W4386385105","https://openalex.org/W4388478995","https://openalex.org/W4392024151","https://openalex.org/W4401176521","https://openalex.org/W4401176799","https://openalex.org/W4401176895","https://openalex.org/W4413126060","https://openalex.org/W4413757104","https://openalex.org/W4413915552","https://openalex.org/W4414281281"],"related_works":[],"abstract_inverted_index":{"Training":[0],"Mixture-of-Experts":[1],"(MoE)":[2],"models":[3],"introduces":[4],"sparse":[5],"and":[6,97,107,122],"highly":[7],"imbalanced":[8],"all-to-all":[9,40],"communication":[10],"that":[11,38,54],"dominates":[12],"iteration":[13,118],"time.":[14],"Conventional":[15],"load-balancing":[16,36],"methods":[17],"fail":[18],"to":[19,52,78],"exploit":[20],"the":[21,48],"deterministic":[22],"topology":[23],"of":[24],"Rail":[25,49],"architectures,":[26],"leaving":[27],"multi-NIC":[28],"bandwidth":[29,104],"underutilized.":[30],"We":[31],"present":[32],"RailS,":[33],"a":[34,70],"distributed":[35,132],"framework":[37],"minimizes":[39],"completion":[41,109],"time":[42,110,119],"in":[43,131],"MoE":[44,99],"training.":[45,133],"RailS":[46,85,101],"leverages":[47],"topology\u2019s":[50],"symmetry":[51],"prove":[53],"uniform":[55,58],"sending":[56],"ensures":[57],"receiving,":[59],"transforming":[60],"global":[61],"coordination":[62],"into":[63],"local":[64,83],"scheduling.":[65],"Each":[66],"node":[67],"independently":[68],"executes":[69],"Longest":[71],"Processing":[72],"Time":[73],"First":[74],"(LPT)":[75],"spraying":[76],"scheduler":[77],"proactively":[79],"balance":[80],"traffic":[81],"using":[82],"information.":[84],"activates":[86],"N":[87],"parallel":[88],"rails":[89],"for":[90],"fine-grained,":[91],"topology-aware":[92],"multipath":[93],"transmission.":[94],"Across":[95],"synthetic":[96],"real-world":[98],"workloads,":[100,115],"improves":[102],"bus":[103],"by":[105,111,120],"20%\u201378%":[106],"reduces":[108],"17%\u201378%.":[112],"For":[113],"Mixtral":[114],"it":[116],"shortens":[117],"18%\u201340%":[121],"achieves":[123],"near-optimal":[124],"load":[125],"balance,":[126],"fully":[127],"exploiting":[128],"architectural":[129],"parallelism":[130]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-24T00:00:00"}
