{"id":"https://openalex.org/W4376652719","doi":"https://doi.org/10.1145/3577193.3593704","title":"A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training","display_name":"A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training","publication_year":2023,"publication_date":"2023-06-20","ids":{"openalex":"https://openalex.org/W4376652719","doi":"https://doi.org/10.1145/3577193.3593704"},"language":"en","primary_location":{"id":"doi:10.1145/3577193.3593704","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3577193.3593704","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3577193.3593704","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3577193.3593704","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101642651","display_name":"Siddharth Singh","orcid":"https://orcid.org/0000-0002-2756-4290"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Siddharth Singh","raw_affiliation_strings":["Department of Computer Science, University of Maryland, College Park, Maryland, USA"],"raw_orcid":"https://orcid.org/0000-0002-2756-4290","affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Maryland, College Park, Maryland, USA","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022644245","display_name":"Olatunji Ruwase","orcid":"https://orcid.org/0000-0002-5508-0728"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Olatunji Ruwase","raw_affiliation_strings":["Microsoft, Inc., Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0002-5508-0728","affiliations":[{"raw_affiliation_string":"Microsoft, Inc., Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004330728","display_name":"Ammar Ahmad Awan","orcid":"https://orcid.org/0000-0002-6272-3760"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ammar Ahmad Awan","raw_affiliation_strings":["Microsoft, Inc., Redmond, Washington, United States"],"raw_orcid":"https://orcid.org/0000-0002-6272-3760","affiliations":[{"raw_affiliation_string":"Microsoft, Inc., Redmond, Washington, United States","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069595628","display_name":"Samyam Rajbhandari","orcid":"https://orcid.org/0000-0002-0386-8759"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Samyam Rajbhandari","raw_affiliation_strings":["Microsoft, Inc., Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0002-0386-8759","affiliations":[{"raw_affiliation_string":"Microsoft, Inc., Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040302174","display_name":"Yuxiong He","orcid":"https://orcid.org/0000-0003-0478-8854"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yuxiong He","raw_affiliation_strings":["Microsoft, Inc., Redmond, Washington, United States of America"],"raw_orcid":"https://orcid.org/0000-0003-0478-8854","affiliations":[{"raw_affiliation_string":"Microsoft, Inc., Redmond, Washington, United States of America","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081506338","display_name":"Abhinav Bhatel\u00e9","orcid":"https://orcid.org/0000-0003-3069-3701"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Abhinav Bhatele","raw_affiliation_strings":["Department of Computer Science, University of Maryland, College Park, Maryland, United States of America"],"raw_orcid":"https://orcid.org/0000-0003-3069-3701","affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Maryland, College Park, Maryland, United States of America","institution_ids":["https://openalex.org/I66946132"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":7.1156,"has_fulltext":true,"cited_by_count":36,"citation_normalized_percentile":{"value":0.98337292,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"203","last_page":"214"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.9444000124931335,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9437000155448914,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8430911302566528},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.6992750763893127},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.5881549715995789},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.49464842677116394},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.47662580013275146},{"id":"https://openalex.org/keywords/base","display_name":"Base (topology)","score":0.4688970148563385},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4502677917480469},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.43531888723373413},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4269988536834717},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.39858245849609375}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8430911302566528},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.6992750763893127},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.5881549715995789},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.49464842677116394},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.47662580013275146},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.4688970148563385},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4502677917480469},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.43531888723373413},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4269988536834717},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39858245849609375},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3577193.3593704","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3577193.3593704","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3577193.3593704","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th International Conference on Supercomputing","raw_type":"proceedings-article"},{"id":"pmh:oai:drum.lib.umd.edu:1903/30507","is_oa":true,"landing_page_url":"http://hdl.handle.net/1903/30507","pdf_url":"https://drum.lib.umd.edu/bitstreams/0a07600b-cd1b-4237-b027-db31ca551c48/download","source":{"id":"https://openalex.org/S4306401518","display_name":"University Libraries (University of Maryland)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Article"},{"id":"doi:10.13016/dspace/u6zm-83ii","is_oa":true,"landing_page_url":"https://doi.org/10.13016/dspace/u6zm-83ii","pdf_url":null,"source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"doi:10.1145/3577193.3593704","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3577193.3593704","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3577193.3593704","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th International Conference on Supercomputing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1313983767","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G1489425746","display_name":null,"funder_award_id":"DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G1528409558","display_name":null,"funder_award_id":"No. DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G1645119126","display_name":null,"funder_award_id":"AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G1719536385","display_name":null,"funder_award_id":"DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G2101401529","display_name":null,"funder_award_id":"Contract No. DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G2777053550","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G2887060908","display_name":null,"funder_award_id":"No. DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G3049477530","display_name":null,"funder_award_id":"Contract No. DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G3075337988","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G4423657506","display_name":null,"funder_award_id":"AC05-00OR22725","funder_id":"https://openalex.org/F4320338287","funder_display_name":"Oak Ridge National Laboratory"},{"id":"https://openalex.org/G4679938283","display_name":null,"funder_award_id":"Contract No. DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G4790718960","display_name":null,"funder_award_id":"DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G4957472368","display_name":null,"funder_award_id":". DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G498139845","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G5085543421","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G5817817810","display_name":null,"funder_award_id":"No. DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G6486853205","display_name":null,"funder_award_id":"Contract No. DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320338287","funder_display_name":"Oak Ridge National Laboratory"},{"id":"https://openalex.org/G6558272803","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G675320460","display_name":null,"funder_award_id":"DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320338287","funder_display_name":"Oak Ridge National Laboratory"},{"id":"https://openalex.org/G6848031779","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6911681599","display_name":null,"funder_award_id":"No. DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6918803902","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G7368700025","display_name":null,"funder_award_id":"No. DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G7995982022","display_name":null,"funder_award_id":"DE-AC05","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G8143874970","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G8269158468","display_name":null,"funder_award_id":"AC05-00OR22725","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G8286507625","display_name":null,"funder_award_id":"DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320338287","funder_display_name":"Oak Ridge National Laboratory"},{"id":"https://openalex.org/G8394328975","display_name":null,"funder_award_id":"Contract No. DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G8906985441","display_name":null,"funder_award_id":"00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G969889393","display_name":null,"funder_award_id":"DE-AC02-","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320332359","display_name":"Office of Science","ror":"https://ror.org/00mmn6b08"},{"id":"https://openalex.org/F4320338284","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63"},{"id":"https://openalex.org/F4320338287","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4376652719.pdf","grobid_xml":"https://content.openalex.org/works/W4376652719.grobid-xml"},"referenced_works_count":25,"referenced_works":["https://openalex.org/W1493774699","https://openalex.org/W1566289585","https://openalex.org/W2064675550","https://openalex.org/W2983101505","https://openalex.org/W3001279689","https://openalex.org/W3040573126","https://openalex.org/W3086105743","https://openalex.org/W3159351344","https://openalex.org/W3167220634","https://openalex.org/W3175937250","https://openalex.org/W3199518308","https://openalex.org/W3205803342","https://openalex.org/W4221150592","https://openalex.org/W4221167110","https://openalex.org/W4224299101","https://openalex.org/W4225905768","https://openalex.org/W4226515448","https://openalex.org/W4281399297","https://openalex.org/W4286824557","https://openalex.org/W4287391717","https://openalex.org/W4287780487","https://openalex.org/W4288089799","https://openalex.org/W4293718192","https://openalex.org/W4294433898","https://openalex.org/W4386768656"],"related_works":["https://openalex.org/W1554644772","https://openalex.org/W2950520577","https://openalex.org/W2494130044","https://openalex.org/W2003935582","https://openalex.org/W3209384898","https://openalex.org/W74409296","https://openalex.org/W1991844655","https://openalex.org/W1595834484","https://openalex.org/W4230999561","https://openalex.org/W1595672120"],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"is":[2],"a":[3,14,52,110,119],"neural":[4],"network":[5],"architecture":[6],"that":[7,58,93],"adds":[8],"sparsely":[9],"activated":[10],"expert":[11,63],"blocks":[12],"to":[13,37,65],"base":[15,44,75,127],"model,":[16],"increasing":[17],"the":[18,67,78,87],"number":[19],"of":[20,69,107],"parameters":[21],"without":[22,113],"impacting":[23],"computational":[24],"costs.":[25],"However,":[26],"current":[27,79],"distributed":[28],"deep":[29],"learning":[30],"frameworks":[31],"are":[32],"limited":[33],"in":[34,86,102],"their":[35],"ability":[36],"train":[38],"high-quality":[39],"MoE":[40,70,123],"models":[41,71,76],"with":[42,72,129],"large":[43],"models.":[45],"In":[46],"this":[47],"work,":[48],"we":[49],"present":[50],"DeepSpeed-TED,":[51],"novel,":[53],"three-dimensional,":[54],"hybrid":[55],"parallel":[56],"algorithm":[57],"combines":[59],"data,":[60],"tensor,":[61],"and":[62,90,104],"parallelism":[64],"enable":[66],"training":[68,118],"4--8\u00d7":[73],"larger":[74],"than":[77],"state-of-the-art.":[80],"We":[81,98],"also":[82],"describe":[83],"memory":[84],"optimizations":[85,92],"optimizer":[88],"step,":[89],"communication":[91,115],"eliminate":[94],"unnecessary":[95],"data":[96],"movement.":[97],"implement":[99],"our":[100,114],"approach":[101],"DeepSpeed":[103],"achieve":[105],"speedups":[106],"26%":[108],"over":[109],"baseline":[111],"(i.e.":[112],"optimizations)":[116],"when":[117],"40":[120],"billion":[121,126],"parameter":[122],"model":[124,128],"(6.7":[125],"16":[130],"experts)":[131],"on":[132],"128":[133],"V100":[134],"GPUs.":[135]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":25},{"year":2024,"cited_by_count":6}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2023-05-17T00:00:00"}
