{"id":"https://openalex.org/W7116338912","doi":"https://doi.org/10.1145/3754598.3754666","title":"Design and Optimization of GPU-Aware MPI Allreduce Using Direct Sendrecv Communication","display_name":"Design and Optimization of GPU-Aware MPI Allreduce Using Direct Sendrecv Communication","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116338912","doi":"https://doi.org/10.1145/3754598.3754666"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754666","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754666","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754666","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120920580","display_name":"Chen-Chun Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Chen-Chun Chen","raw_affiliation_strings":["The Ohio State University, Columbus, USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University, Columbus, USA","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015189372","display_name":"Jinghan Yao","orcid":"https://orcid.org/0009-0002-7129-9508"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinghan Yao","raw_affiliation_strings":["The Ohio State University, Columbus, USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University, Columbus, USA","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042668878","display_name":"H Subramoni","orcid":null},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hari Subramoni","raw_affiliation_strings":["The Ohio State University, Columbus, USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University, Columbus, USA","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5105360581","display_name":"Dhabaleswar K. Panda","orcid":null},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dhabaleswar K. Panda","raw_affiliation_strings":["The Ohio State University, Columbus, USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University, Columbus, USA","institution_ids":["https://openalex.org/I52357470"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5120920580"],"corresponding_institution_ids":["https://openalex.org/I52357470"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64517462,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"784","last_page":"793"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6292999982833862,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6292999982833862,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.05959999933838844,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.037700001150369644,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7786999940872192},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.6323000192642212},{"id":"https://openalex.org/keywords/vendor","display_name":"Vendor","score":0.5533999800682068},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.5138999819755554},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5080999732017517},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.39989998936653137},{"id":"https://openalex.org/keywords/petascale-computing","display_name":"Petascale computing","score":0.3882000148296356},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.35929998755455017}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8371000289916992},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7786999940872192},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.6323000192642212},{"id":"https://openalex.org/C2777338717","wikidata":"https://www.wikidata.org/wiki/Q1762621","display_name":"Vendor","level":2,"score":0.5533999800682068},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5321999788284302},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.5138999819755554},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5080999732017517},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.453900009393692},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4025999903678894},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.39989998936653137},{"id":"https://openalex.org/C185410017","wikidata":"https://www.wikidata.org/wiki/Q7171778","display_name":"Petascale computing","level":3,"score":0.3882000148296356},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.36649999022483826},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.35749998688697815},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C31352089","wikidata":"https://www.wikidata.org/wiki/Q3750474","display_name":"Systems design","level":2,"score":0.3264999985694885},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3190000057220459},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.29660001397132874},{"id":"https://openalex.org/C106516650","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm design","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2741999924182892},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2612999975681305},{"id":"https://openalex.org/C173061102","wikidata":"https://www.wikidata.org/wiki/Q478819","display_name":"Bandwidth throttling","level":3,"score":0.2547999918460846},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754666","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754666","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754666","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754666","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.4695058763027191,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[{"id":"https://openalex.org/G3358531649","display_name":null,"funder_award_id":"2311830","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8189731814","display_name":null,"funder_award_id":"2323116","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8438779844","display_name":null,"funder_award_id":"2312927","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G855500582","display_name":null,"funder_award_id":"2415201","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2057332538","https://openalex.org/W2105549957","https://openalex.org/W2131613942","https://openalex.org/W2955454939","https://openalex.org/W3204371345","https://openalex.org/W4210863326","https://openalex.org/W4389361850","https://openalex.org/W4407785136","https://openalex.org/W4412605653"],"related_works":[],"abstract_inverted_index":{"Modern":[0],"GPU-accelerated":[1],"high-performance":[2],"computing":[3],"(HPC)":[4],"and":[5,49,72,78,88,137],"deep":[6],"learning":[7],"(DL)":[8],"applications":[9],"rely":[10],"heavily":[11],"on":[12,103],"collective":[13],"communication,":[14],"particularly":[15],"the":[16,126,141],"Allreduce":[17,60],"operation.":[18],"As":[19],"systems":[20,106],"scale":[21,44],"to":[22,39,46,69,96,117,131],"hundreds":[23],"or":[24,33],"thousands":[25],"of":[26],"GPUs,":[27],"conventional":[28],"algorithms":[29],"such":[30],"as":[31],"Ring":[32],"vendor":[34],"libraries":[35],"like":[36],"NCCL":[37,113],"struggle":[38],"sustain":[40],"performance":[41],"for":[42,99],"large":[43],"due":[45],"communication":[47],"bottlenecks":[48],"algorithmic":[50],"dependencies.":[51],"In":[52,123],"this":[53],"work,":[54],"we":[55,84],"propose":[56],"a":[57,63],"novel":[58],"GPU-aware":[59],"design":[61,93,128],"using":[62],"Direct":[64],"Sendrecv":[65],"algorithm":[66],"with":[67],"throttling":[68],"improve":[70],"scalability":[71],"bandwidth":[73],"utilization":[74],"across":[75],"various":[76],"scales":[77],"interconnects.":[79],"To":[80],"further":[81],"reduce":[82],"overhead,":[83],"introduce":[85],"computation-communication":[86],"overlap":[87],"kernel":[89],"fusion":[90],"techniques.":[91],"The":[92],"also":[94],"extends":[95],"CPU-staging":[97],"scenarios":[98],"small":[100],"messages.":[101],"Evaluations":[102],"large-scale":[104],"GPU":[105],"demonstrate":[107],"that":[108],"our":[109],"designs":[110],"outperform":[111],"baseline":[112],"implementations":[114],"by":[115],"up":[116,130],"40%":[118],"at":[119],"medium":[120],"message":[121],"sizes.":[122],"application-level":[124],"evaluations,":[125],"proposed":[127],"achieves":[129],"7%":[132],"improvement":[133,139],"in":[134,140],"nanoGPT":[135],"training":[136],"27%":[138],"Amber":[142],"HPC":[143],"simulation.":[144]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-12-21T00:00:00"}
