{"id":"https://openalex.org/W4411471720","doi":"https://doi.org/10.1145/3695053.3731077","title":"MeshSlice: Efficient 2D Tensor Parallelism for Distributed DNN Training","display_name":"MeshSlice: Efficient 2D Tensor Parallelism for Distributed DNN Training","publication_year":2025,"publication_date":"2025-06-20","ids":{"openalex":"https://openalex.org/W4411471720","doi":"https://doi.org/10.1145/3695053.3731077"},"language":"en","primary_location":{"id":"doi:10.1145/3695053.3731077","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731077","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731077","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731077","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025937126","display_name":"Hyoungwook Nam","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hyoungwook Nam","raw_affiliation_strings":["University of Illinois at Urbana-Champaign, Champaign, Illinois, USA"],"raw_orcid":"https://orcid.org/0000-0001-8065-5151","affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign, Champaign, Illinois, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034202662","display_name":"Gerasimos Gerogiannis","orcid":"https://orcid.org/0000-0002-7946-2683"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gerasimos Gerogiannis","raw_affiliation_strings":["University of Illinois at Urbana-Champaign, Champaign, Illinois, USA"],"raw_orcid":"https://orcid.org/0000-0002-7946-2683","affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign, Champaign, Illinois, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055909708","display_name":"Josep Torrellas","orcid":"https://orcid.org/0000-0003-2595-5228"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Josep Torrellas","raw_affiliation_strings":["University of Illinois at Urbana-Champaign, Champaign, Illinois, USA"],"raw_orcid":"https://orcid.org/0000-0003-2595-5228","affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign, Champaign, Illinois, USA","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.8048,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.90789721,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"821","last_page":"834"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9884999990463257,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9864000082015991,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7959253787994385},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.7818102836608887},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6762635707855225},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.617747962474823},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5009105205535889},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1105424165725708}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7959253787994385},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.7818102836608887},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6762635707855225},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.617747962474823},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5009105205535889},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1105424165725708},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3695053.3731077","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731077","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731077","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3695053.3731077","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731077","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731077","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2004457323","display_name":null,"funder_award_id":"CNS 1956007","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2374893308","display_name":null,"funder_award_id":"1956007","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4845837577","display_name":null,"funder_award_id":"CCF 2107470","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5485016241","display_name":"SHF: Medium: Cross-Cutting Effort to Make Non-Volatile Memories Truly Usable","funder_award_id":"2107470","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306087","display_name":"Semiconductor Research Corporation","ror":"https://ror.org/047z4n946"},{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411471720.pdf","grobid_xml":"https://content.openalex.org/works/W4411471720.grobid-xml"},"referenced_works_count":15,"referenced_works":["https://openalex.org/W2012652661","https://openalex.org/W2056999868","https://openalex.org/W2057218453","https://openalex.org/W2565436413","https://openalex.org/W2606722458","https://openalex.org/W3006586535","https://openalex.org/W3081168214","https://openalex.org/W3097528158","https://openalex.org/W3150920444","https://openalex.org/W4285503871","https://openalex.org/W4312060029","https://openalex.org/W4380874786","https://openalex.org/W4394998995","https://openalex.org/W4395106447","https://openalex.org/W4395117348"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2045183646","https://openalex.org/W2233261550","https://openalex.org/W2023505575","https://openalex.org/W1595672120","https://openalex.org/W4230999561"],"abstract_inverted_index":{"In":[0],"distributed":[1,124],"training":[2,177,188],"of":[3,9,18,84,104,148,206],"large":[4],"DNN":[5,125],"models,":[6],"the":[7,81,88,130,149,154,211,222],"scalability":[8,26],"onedimensional":[10],"(1D)":[11],"tensor":[12],"parallelism":[13],"(TP)":[14],"is":[15,77],"limited":[16],"because":[17,29],"its":[19],"high":[20,53,58],"communication":[21,32,67,72,140,150,170],"cost.2D":[22],"TP":[23,46,122],"attains":[24],"extra":[25],"and":[27,61,92,169,213,217],"efficiency":[28,196],"it":[30,76],"reduces":[31],"relative":[33],"to":[34,79,100,198],"1D":[35],"TP.Unfortunately,":[36],"existing":[37],"algorithms":[38],"for":[39,119,172],"general":[40],"matrix":[41],"multiplication":[42],"(GeMM)":[43],"in":[44,123],"2D":[45,63,85,105,116,121,163,202],"suffer":[47],"from":[48,57],"inefficiencies.Indeed,":[49],"Cannon's":[50],"algorithm":[51,118,128],"incurs":[52],"traffic,":[54],"SUMMA":[55],"suffers":[56],"synchronization":[59],"overhead,":[60],"a":[62,94,114,143,204],"GeMM":[64,117,164],"with":[65,73,141],"collective":[66,131],"operations":[68],"does":[69],"not":[70],"overlap":[71],"computation.In":[74],"addition,":[75],"difficult":[78],"optimize":[80],"numerous":[82],"parameters":[83],"TP,":[86],"including":[87],"dataflow,":[89],"mesh":[90,167],"shape,":[91,168],"sharding.As":[93],"result,":[95,144],"human":[96],"experts":[97],"are":[98],"needed":[99],"find":[101],"efficient":[102,120,162],"configurations":[103],"TP.To":[106],"address":[107],"these":[108],"problems,":[109],"this":[110],"paper":[111],"proposes":[112],"MeshSlice,":[113,183],"novel":[115],"training.The":[126],"MeshSlice":[127,145,155,193,209],"slices":[129],"communications":[132],"into":[133],"multiple":[134],"partial":[135],"collectives":[136],"that":[137,192],"allow":[138],"overlapping":[139],"computation.As":[142],"hides":[146],"most":[147],"latency.We":[151],"also":[152],"present":[153],"LLM":[156,189],"autotuner,":[157],"which":[158],"automates":[159],"finding":[160],"an":[161],"dataflow":[165],"configuration,":[166],"granularity":[171],"Large":[173],"Language":[174],"Model":[175],"(LLM)":[176],"using":[178],"analytical":[179],"cost":[180],"models.To":[181],"evaluate":[182],"we":[184],"simulate":[185],"TPUv4":[186],"clusters":[187],"models.We":[190],"show":[191],"maintains":[194],"good":[195],"up":[197],"at":[199],"least":[200],"256-way":[201],"TP.In":[203],"cluster":[205],"256":[207],"TPUs,":[208],"trains":[210],"GPT-3":[212],"Megatron-NLG":[214],"models":[215],"12.0%":[216],"23.4%":[218],"faster,":[219],"respectively,":[220],"than":[221],"state-of-the-art":[223],"algorithm.":[224]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
