{"id":"https://openalex.org/W4417403193","doi":"https://doi.org/10.1109/pact65351.2025.00043","title":"Generating Two-Level, GPU-Aware Mappings for Distributed Tensor Computations","display_name":"Generating Two-Level, GPU-Aware Mappings for Distributed Tensor Computations","publication_year":2025,"publication_date":"2025-11-03","ids":{"openalex":"https://openalex.org/W4417403193","doi":"https://doi.org/10.1109/pact65351.2025.00043"},"language":null,"primary_location":{"id":"doi:10.1109/pact65351.2025.00043","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00043","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060255270","display_name":"Botao Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Botao Wu","raw_affiliation_strings":["The Ohio State University,Dept. of Computer Science and Engineering,Columbus,OH,USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University,Dept. of Computer Science and Engineering,Columbus,OH,USA","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5048530008","display_name":"Martin Kong","orcid":"https://orcid.org/0000-0001-8008-0220"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Martin Kong","raw_affiliation_strings":["The Ohio State University,Dept. of Computer Science and Engineering,Columbus,OH,USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University,Dept. of Computer Science and Engineering,Columbus,OH,USA","institution_ids":["https://openalex.org/I52357470"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5060255270"],"corresponding_institution_ids":["https://openalex.org/I52357470"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.466695,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"401","last_page":"415"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.46129998564720154,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.46129998564720154,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.28859999775886536,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10792","display_name":"Matrix Theory and Algorithms","score":0.04580000042915344,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.5968999862670898},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5508000254631042},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5472000241279602},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5418000221252441},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5048999786376953},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.4837000072002411},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4390999972820282},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4388999938964844},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.40639999508857727}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6643000245094299},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.5968999862670898},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5508000254631042},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5472000241279602},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5418000221252441},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5048999786376953},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4837000072002411},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4408999979496002},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4390999972820282},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4388999938964844},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.4277999997138977},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.40639999508857727},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.37770000100135803},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3441999852657318},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.3416999876499176},{"id":"https://openalex.org/C146380142","wikidata":"https://www.wikidata.org/wiki/Q1137726","display_name":"Directed graph","level":2,"score":0.3343999981880188},{"id":"https://openalex.org/C2779982483","wikidata":"https://www.wikidata.org/wiki/Q6094420","display_name":"Iterative refinement","level":2,"score":0.319599986076355},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.31949999928474426},{"id":"https://openalex.org/C51255310","wikidata":"https://www.wikidata.org/wiki/Q1163016","display_name":"Tensor product","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C61445026","wikidata":"https://www.wikidata.org/wiki/Q217608","display_name":"Fixed point","level":2,"score":0.3154999911785126},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.31139999628067017},{"id":"https://openalex.org/C136119220","wikidata":"https://www.wikidata.org/wiki/Q1000660","display_name":"Algebra over a field","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.30149999260902405},{"id":"https://openalex.org/C74197172","wikidata":"https://www.wikidata.org/wiki/Q1195339","display_name":"Directed acyclic graph","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C18116961","wikidata":"https://www.wikidata.org/wiki/Q734209","display_name":"Generating set of a group","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C184720557","wikidata":"https://www.wikidata.org/wiki/Q7825049","display_name":"Topology (electrical circuits)","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C102379954","wikidata":"https://www.wikidata.org/wiki/Q2589940","display_name":"Call graph","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/pact65351.2025.00043","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00043","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W201315547","https://openalex.org/W1480909796","https://openalex.org/W1972501001","https://openalex.org/W2008677874","https://openalex.org/W2056999868","https://openalex.org/W2071758457","https://openalex.org/W2076517649","https://openalex.org/W2098614082","https://openalex.org/W2127226421","https://openalex.org/W2612387305","https://openalex.org/W2791673912","https://openalex.org/W2804032941","https://openalex.org/W2902041647","https://openalex.org/W2926767350","https://openalex.org/W2962976813","https://openalex.org/W2973084438","https://openalex.org/W2984305089","https://openalex.org/W2992165038","https://openalex.org/W3146625038","https://openalex.org/W3174627783","https://openalex.org/W3177445289","https://openalex.org/W4220690649","https://openalex.org/W4318541538","https://openalex.org/W4388667345","https://openalex.org/W4392265917","https://openalex.org/W4416203769"],"related_works":[],"abstract_inverted_index":{"We":[0,68,102],"introduce":[1],"a":[2,20,31,52,75,111],"two-level":[3],"scheme":[4],"to":[5,144],"generate":[6],"GPUaware":[7],"MPI/NCCL":[8],"code":[9],"for":[10,99],"distributed":[11],"tensor":[12,27],"computations.":[13],"Our":[14],"generator":[15,50,109],"takes":[16],"the":[17,43,66,70,86,104,131],"specification":[18],"of":[19,26,48,106,113],"linearized":[21],"Directed":[22],"Acyclic":[23],"Graph":[24],"(DAG)":[25],"operators":[28],"and":[29,40,42,115,128,134],"produces":[30],"global":[32],"mapping":[33,72,108],"solution":[34],"that":[35,56],"considers":[36],"MPI":[37],"communication":[38,63],"(inter":[39],"intranode)":[41],"local":[44],"computation.":[45],"The":[46,90],"core":[47],"our":[49,107],"is":[51,79],"new":[53,91],"bit-vector":[54],"representation":[55],"compactly":[57],"models":[58],"mappings":[59],"as":[60,62],"well":[61],"directions":[64],"along":[65],"grid.":[67],"incorporate":[69],"2-level":[71],"decisions":[73],"into":[74],"non-linear":[76],"formulation":[77],"which":[78],"optimized":[80],"in":[81],"an":[82],"iterative":[83],"fashion":[84],"with":[85,122],"Z3":[87],"SMT":[88],"solver.":[89],"mapper":[92],"supports":[93],"both":[94],"NVIDIA":[95],"NCCL,":[96],"MVAPICH-gdr,":[97],"allowing":[98],"better":[100],"portability.":[101],"demonstrate":[103],"efficiency":[105],"on":[110,118],"set":[112],"matrix-":[114],"tensor-":[116],"DAGs,":[117],"two":[119],"multi-GPU":[120],"clusters":[121],"NVLink":[123],"or":[124],"PCIe":[125],"intra-node":[126],"interconnect,":[127],"compare":[129],"against":[130],"COSMA":[132],"library":[133],"CTF":[135],"framework,":[136],"achieving":[137],"speedups":[138],"ranging":[139],"from":[140],"2.6\u00d7":[141],"(over":[142,146],"COSMA)":[143],"18\u00d7":[145],"CTF).":[147]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-12-16T00:00:00"}
