{"id":"https://openalex.org/W4406703768","doi":"https://doi.org/10.1145/3669940.3707272","title":"FSMoE: A Flexible and Scalable Training System for Sparse Mixture-of-Experts Models","display_name":"FSMoE: A Flexible and Scalable Training System for Sparse Mixture-of-Experts Models","publication_year":2025,"publication_date":"2025-02-06","ids":{"openalex":"https://openalex.org/W4406703768","doi":"https://doi.org/10.1145/3669940.3707272"},"language":"en","primary_location":{"id":"doi:10.1145/3669940.3707272","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3669940.3707272","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2501.10714","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016665007","display_name":"Xinglin Pan","orcid":"https://orcid.org/0000-0002-1172-9935"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xinglin Pan","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110841034","display_name":"Wenxiang Lin","orcid":"https://orcid.org/0000-0001-5147-0844"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxiang Lin","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075963446","display_name":"Lin Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Lin Zhang","raw_affiliation_strings":["Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016836702","display_name":"Shaohuai Shi","orcid":"https://orcid.org/0000-0002-1418-5160"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaohuai Shi","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048313055","display_name":"Zhenheng Tang","orcid":"https://orcid.org/0000-0001-8769-9974"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Zhenheng Tang","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Rui Wang","orcid":"https://orcid.org/0009-0004-5580-189X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rui Wang","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060171629","display_name":"Bo Li","orcid":"https://orcid.org/0000-0003-2955-750X"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Bo Li","raw_affiliation_strings":["Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100730785","display_name":"Xiaowen Chu","orcid":"https://orcid.org/0000-0001-9745-4372"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xiaowen Chu","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China, &amp; Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China, &amp; Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5016665007"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":15.7194,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":{"value":0.98641666,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"524","last_page":"539"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10203","display_name":"Recommender Systems and Techniques","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9919000267982483,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8149771690368652},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5858803987503052},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.564709484577179},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5622626543045044},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.44966283440589905},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.44501814246177673},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4446384906768799},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.4426920413970947},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4269736409187317},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.4194684326648712},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.37737417221069336},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.25391650199890137},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.22315546870231628},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.10803043842315674},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.10791563987731934},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.0921541154384613}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8149771690368652},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5858803987503052},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.564709484577179},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5622626543045044},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.44966283440589905},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.44501814246177673},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4446384906768799},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.4426920413970947},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4269736409187317},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.4194684326648712},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.37737417221069336},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.25391650199890137},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.22315546870231628},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.10803043842315674},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.10791563987731934},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0921541154384613},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3669940.3707272","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3669940.3707272","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2501.10714","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.10714","pdf_url":"https://arxiv.org/pdf/2501.10714","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-151467","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-151467","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2501.10714","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.10714","pdf_url":"https://arxiv.org/pdf/2501.10714","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1477544716","display_name":null,"funder_award_id":"Guangdong","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1560154075","display_name":null,"funder_award_id":"2024A03","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2191774043","display_name":null,"funder_award_id":"62272122","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2981938667","display_name":null,"funder_award_id":"Shenzhen","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3054550026","display_name":null,"funder_award_id":"311511","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3309074268","display_name":null,"funder_award_id":"2010005","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3314459484","display_name":"Collaborative Research:   Algorithms for Large-Scale Stochastic and Nonlinear Optimization","funder_award_id":"1620022","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3491687736","display_name":"MRI: Development of a Scanning Probe Microscopy Tool to Study Nanoscale Photoactivated Processes","funder_award_id":"0923115","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G37568934","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3860524358","display_name":"Fundamental Aspects of M-theory and Superspace","funder_award_id":"1620742","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G391238517","display_name":null,"funder_award_id":", and","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4020255992","display_name":null,"funder_award_id":"Project","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4913288181","display_name":null,"funder_award_id":"092311","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6588910480","display_name":"SIGHT: A compact adaptive optics technology demonstrator for all-sky panchromatic optical/infrared spectroscopy","funder_award_id":"2010005","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G717249060","display_name":"WOU-MMA: Maximizing Science Output of LIGO: Data Analysis and Improved Detectors","funder_award_id":"2309231","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7349572720","display_name":null,"funder_award_id":"2022B1212010005","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7726157001","display_name":null,"funder_award_id":"Grant No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4406703768.pdf","grobid_xml":"https://content.openalex.org/works/W4406703768.grobid-xml"},"referenced_works_count":42,"referenced_works":["https://openalex.org/W2581624817","https://openalex.org/W2809290718","https://openalex.org/W2884711234","https://openalex.org/W2901299405","https://openalex.org/W2945785363","https://openalex.org/W3040573126","https://openalex.org/W3147874613","https://openalex.org/W3193250980","https://openalex.org/W3193985311","https://openalex.org/W3204998121","https://openalex.org/W3205972749","https://openalex.org/W4205983429","https://openalex.org/W4220838824","https://openalex.org/W4220967350","https://openalex.org/W4224211001","https://openalex.org/W4224299101","https://openalex.org/W4224308101","https://openalex.org/W4226515448","https://openalex.org/W4238076109","https://openalex.org/W4281688600","https://openalex.org/W4281922990","https://openalex.org/W4293775970","https://openalex.org/W4312060029","https://openalex.org/W4321636575","https://openalex.org/W4364382874","https://openalex.org/W4376652719","https://openalex.org/W4386260498","https://openalex.org/W4386396242","https://openalex.org/W4387302777","https://openalex.org/W4391420987","https://openalex.org/W4394923534","https://openalex.org/W4395117348","https://openalex.org/W4396815229","https://openalex.org/W4399694321","https://openalex.org/W4401508784","https://openalex.org/W4405756071","https://openalex.org/W6684859321","https://openalex.org/W6739901393","https://openalex.org/W6778883912","https://openalex.org/W6810081322","https://openalex.org/W6810297391","https://openalex.org/W6839827798"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W2146343568","https://openalex.org/W98480971","https://openalex.org/W2150291671","https://openalex.org/W2013643406","https://openalex.org/W2027972911","https://openalex.org/W2157978810","https://openalex.org/W4391547476","https://openalex.org/W2597809628"],"abstract_inverted_index":{"Recent":[0],"large":[1],"language":[2],"models":[3,127,183,298],"(LLMs)":[4],"have":[5],"tended":[6],"to":[7,10,87,110,156,205,237,256,268],"leverage":[8],"sparsity":[9],"reduce":[11],"computations,":[12],"employing":[13],"the":[14,164,276],"sparsely":[15],"activated":[16],"mixture-of-experts":[17],"(MoE)":[18],"technique.":[19],"MoE":[20,47,70,77,122,126,144,166,176,182,222,226,278,291,297],"introduces":[21],"four":[22,140],"modules,":[23],"including":[24],"token":[25,27],"routing,":[26],"communication,":[28],"expert":[29,32],"computation,":[30],"and":[31,38,66,82,107,114,124,147,160,170,178,187,218,232,241,253,260,282,293,302],"parallelism,":[33],"that":[34,135,272],"impact":[35],"model":[36],"quality":[37],"training":[39,54,167,201,279],"efficiency.":[40],"To":[41,92,209],"enable":[42],"ver-":[43],"satile":[44],"usage":[45],"of":[46,69,143,221,286],"models,":[48],"we":[49,97,197,214,229,243],"introduce":[50],"FSMoE,":[51],"a":[52,108,157,190,199,254],"flexible":[53,200],"system":[55,202],"optimizing":[56],"task":[57,73,95,207],"scheduling":[58,74],"with":[59,85,120,235,284],"three":[60],"novel":[61],"techniques:":[62],"1)":[63,136,213],"Unified":[64],"abstraction":[65,217],"online":[67,219],"profiling":[68,220],"modules":[71,223],"for":[72,104,250],"across":[75,224],"various":[76,225],"implementations.":[78],"2)":[79,161,228],"Co-scheduling":[80],"intra-node":[81,231],"inter-node":[83,233],"communications":[84,113,234,259],"computations":[86,236],"minimize":[88,238],"communication":[89,239],"overheads.":[90],"3)":[91,242],"support":[93],"near-optimal":[94],"scheduling,":[96],"design":[98,215,244],"an":[99,245],"adaptive":[100,246],"gradient":[101,105,247,251],"partitioning":[102,248],"method":[103,249],"aggregation":[106,252],"schedule":[109,255],"adaptively":[111,257],"pipeline":[112,258],"computations.":[115,261],"We":[116],"conduct":[117],"extensive":[118],"experiments":[119],"configured":[121],"layers":[123,177,292],"real-world":[125,181,296],"on":[128,174,180,185,264,288,295,300],"two":[129,265],"GPU":[130],"clusters.":[131],"Experimental":[132,262],"results":[133,263],"show":[134,271],"our":[137,273],"FSMoE":[138,162,204,274],"supports":[139],"popular":[141,191],"types":[142],"routing":[145,192],"functions":[146],"is":[148],"more":[149],"efficient":[150],"than":[151],"existing":[152],"implementations":[153],"(with":[154],"up":[155,267],"1.42\u00d7":[158],"speedup),":[159],"outperforms":[163,275],"state-of-the-art":[165,277],"systems":[168,280],"(DeepSpeed-MoE":[169,281],"Tutel)":[171,283],"by":[172],"1.18\u00d7-1.22\u00d7":[173],"1458":[175,289],"1.19\u00d7-3.01\u00d7":[179],"based":[184,299],"GPT-2":[186,301],"Mixtral":[188],"using":[189],"function.":[193],"In":[194],"this":[195,211],"work,":[196],"present":[198],"named":[203],"optimize":[206],"scheduling.":[208],"achieve":[210],"goal:":[212],"unified":[216],"implementations,":[227],"co-schedule":[230],"overhead,":[240],"clusters":[266],"48":[269],"GPUs":[270],"speedups":[285],"1.18x-1.22x":[287],"customized":[290],"1.19x-3.01x":[294],"Mixtral.":[303]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":4}],"updated_date":"2026-04-18T07:56:08.524223","created_date":"2025-10-10T00:00:00"}
