{"id":"https://openalex.org/W7156310530","doi":"https://doi.org/10.48550/arxiv.2604.22228","title":"Accelerating Intra-Node GPU-to-GPU Communication Through Multi-Path Transfers with CUDA Graphs","display_name":"Accelerating Intra-Node GPU-to-GPU Communication Through Multi-Path Transfers with CUDA Graphs","publication_year":2026,"publication_date":"2026-04-24","ids":{"openalex":"https://openalex.org/W7156310530","doi":"https://doi.org/10.48550/arxiv.2604.22228"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.22228","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22228","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.22228","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031362631","display_name":"Amirhossein Sojoodi","orcid":"https://orcid.org/0000-0001-9877-3201"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sojoodi, Amirhossein","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081527852","display_name":"Y\u0131ltan Hassan Temu\u00e7in","orcid":"https://orcid.org/0000-0002-4145-4848"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Temucin, Yiltan Hassan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134730401","display_name":"Amirreza Baratisedeh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baratisedeh, Amirreza","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114553985","display_name":"Hamed Sharifian","orcid":"https://orcid.org/0009-0002-2275-3313"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharifian, Hamed","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5039854005","display_name":"Ahmad Afsahi","orcid":"https://orcid.org/0000-0002-2924-6851"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Afsahi, Ahmad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5031362631"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.41359999775886536,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.41359999775886536,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.28049999475479126,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.21040000021457672,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.9305999875068665},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.7073000073432922},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5397999882698059},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.5152999758720398},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.48170000314712524},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.41029998660087585}],"concepts":[{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.9305999875068665},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8503999710083008},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.7073000073432922},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5397999882698059},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5239999890327454},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.5152999758720398},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.48170000314712524},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.41029998660087585},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.29679998755455017},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.29660001397132874},{"id":"https://openalex.org/C126831891","wikidata":"https://www.wikidata.org/wiki/Q221673","display_name":"Host (biology)","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.26339998841285706},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.22228","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22228","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.22228","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22228","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Effective":[0],"intra-node":[1,37],"GPU":[2,40,131],"communication":[3,17,56,66,103],"is":[4,81],"essential":[5],"for":[6],"optimizing":[7,55],"performance":[8],"in":[9,65,118],"MPI-based":[10],"HPC":[11],"applications,":[12],"especially":[13],"when":[14,123],"leveraging":[15,44],"multiple":[16,45],"paths.":[18],"In":[19],"this":[20],"study,":[21],"we":[22,61],"propose":[23],"a":[24,95,108],"novel":[25],"approach":[26,80,104],"that":[27],"integrates":[28],"CUDA":[29,59,87,100],"Graphs":[30,88],"into":[31,89],"the":[32,52,73,82,114,125],"UCX":[33,116],"framework":[34],"to":[35,84,107,113,137],"enhance":[36],"multi-path":[38,102],"point-to-point":[39],"communication.":[41],"By":[42],"concurrently":[43],"paths,":[46,132],"including":[47],"NVLink":[48],"and":[49,54,68,128],"PCIe":[50],"through":[51],"host,":[53],"workflows":[57],"using":[58],"Graph,":[60],"achieve":[62],"significant":[63],"reductions":[64],"overhead":[67],"improve":[69],"execution":[70],"efficiency.":[71],"To":[72],"best":[74],"of":[75],"our":[76,78,98],"knowledge,":[77],"proposed":[79,99],"first":[83],"seamlessly":[85],"integrate":[86],"UCX.":[90],"Through":[91],"extensive":[92],"experiments":[93],"on":[94],"four-GPU":[96],"node,":[97],"Graph-based":[101],"achieves":[105],"up":[106,136],"2.95x":[109],"bandwidth":[110,121],"improvement,":[111],"compared":[112],"single-path":[115],"(UCT::CUDA-IPC),":[117],"GPU-to-GPU":[119],"OMB":[120],"test":[122],"utilizing":[124],"host":[126],"path":[127],"two":[129],"other":[130],"at":[133],"message":[134],"sizes":[135],"512MB.":[138]},"counts_by_year":[],"updated_date":"2026-04-28T06:12:00.211691","created_date":"2026-04-28T00:00:00"}
