{"id":"https://openalex.org/W7148572408","doi":"https://doi.org/10.48550/arxiv.2604.00317","title":"From Skew to Symmetry: Node-Interconnect Multi-Path Balancing with Execution-time Planning for Modern GPU Clusters","display_name":"From Skew to Symmetry: Node-Interconnect Multi-Path Balancing with Execution-time Planning for Modern GPU Clusters","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7148572408","doi":"https://doi.org/10.48550/arxiv.2604.00317"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00317","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00317","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00317","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015189372","display_name":"Jinghan Yao","orcid":"https://orcid.org/0009-0002-7129-9508"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yao, Jinghan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132806339","display_name":"Kaushik Kandadi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kandadi, Kaushik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103171233","display_name":"Bharath Ramesh","orcid":"https://orcid.org/0000-0002-6430-8587"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ramesh, Bharath","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132812401","display_name":"Hari Subramoni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Subramoni, Hari","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132791212","display_name":"Dhabaleswar K. Panda","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Panda, Dhabaleswar K.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5015189372"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.4377000033855438,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.4377000033855438,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3885999917984009,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.05719999969005585,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.7623999714851379},{"id":"https://openalex.org/keywords/skew","display_name":"Skew","score":0.6832000017166138},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.5062000155448914},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4560999870300293},{"id":"https://openalex.org/keywords/jitter","display_name":"Jitter","score":0.44769999384880066},{"id":"https://openalex.org/keywords/infiniband","display_name":"InfiniBand","score":0.42570000886917114},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4108999967575073},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.3513999879360199},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.336899995803833}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8697999715805054},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.7623999714851379},{"id":"https://openalex.org/C43711488","wikidata":"https://www.wikidata.org/wiki/Q7534783","display_name":"Skew","level":2,"score":0.6832000017166138},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6560999751091003},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.5062000155448914},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.4932999908924103},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4560999870300293},{"id":"https://openalex.org/C134652429","wikidata":"https://www.wikidata.org/wiki/Q1052698","display_name":"Jitter","level":2,"score":0.44769999384880066},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.43959999084472656},{"id":"https://openalex.org/C2781030343","wikidata":"https://www.wikidata.org/wiki/Q922437","display_name":"InfiniBand","level":2,"score":0.42570000886917114},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4108999967575073},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.3513999879360199},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C55282118","wikidata":"https://www.wikidata.org/wiki/Q252683","display_name":"Snapshot (computer storage)","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.2955999970436096},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C22684755","wikidata":"https://www.wikidata.org/wiki/Q847526","display_name":"Queueing theory","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C26324664","wikidata":"https://www.wikidata.org/wiki/Q1065525","display_name":"Message queue","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00317","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00317","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00317","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00317","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"GPU-based":[1],"high-performance":[2],"computing":[3],"clusters":[4],"offer":[5],"unprecedented":[6],"communication":[7,24,60,106,166],"bandwidth":[8,82,197],"through":[9,152],"heterogeneous":[10],"intra-node":[11,120,196],"interconnects":[12],"and":[13,56,65,121,133,155,172,176,186,198,210,220],"inter-node":[14,122,201],"networks.":[15],"However,":[16],"despite":[17],"this":[18,126],"high":[19],"aggregate":[20],"bandwidth,":[21],"many":[22],"real-world":[23],"patterns":[25],"fail":[26],"to":[27,37,113,149,193,204,214],"fully":[28,183],"utilize":[29],"the":[30],"available":[31,119],"hardware.":[32],"Traffic":[33],"skew":[34],"often":[35],"leads":[36],"situations":[38],"where":[39],"a":[40,104,128,138],"small":[41],"subset":[42],"of":[43],"links":[44],"becomes":[45],"oversaturated":[46],"while":[47,227],"others":[48],"remain":[49],"underutilized,":[50],"resulting":[51],"in":[52],"congestion,":[53],"latency":[54],"spikes,":[55],"poor":[57],"scalability.":[58],"Existing":[59],"frameworks":[61],"such":[62],"as":[63,127],"NCCL":[64,209],"MPI":[66,211],"with":[67,101,164,182],"UCX":[68],"typically":[69],"rely":[70],"on":[71,216,222],"static":[72],"fastest-path":[73],"routing":[74],"or":[75],"hashing-based":[76],"multi-rail":[77],"striping,":[78],"which":[79],"leaves":[80],"significant":[81],"unused":[83],"when":[84],"runtime":[85,105],"traffic":[86,112,151],"deviates":[87],"from":[88],"expected":[89],"distributions.":[90],"To":[91],"address":[92],"these":[93],"limitations,":[94],"we":[95],"propose":[96],"NIMBLE":[97,124,190],"(Node-Interconnect":[98],"Multi-path":[99],"Balancing":[100],"Execution-time":[102],"orchestration),":[103],"orchestration":[107],"system":[108,159],"that":[109],"dynamically":[110],"redistributes":[111],"balance":[114],"link":[115],"utilization":[116],"across":[117],"all":[118],"paths.":[123],"formulates":[125],"capacity-normalized":[129],"minimum-congestion":[130],"optimization":[131],"problem":[132],"solves":[134],"it":[135],"efficiently":[136],"using":[137],"multiplicative-weights":[139],"algorithm.":[140],"It":[141,207],"further":[142],"employs":[143],"CUDA-aware":[144],"GPU":[145],"kernel-based":[146],"RDMA":[147],"pipelining":[148],"route":[150],"intermediate":[153],"GPUs":[154],"rail-matched":[156],"NICs.":[157],"The":[158],"is":[160],"endpoint-driven,":[161],"integrates":[162],"transparently":[163],"existing":[165],"libraries":[167],"without":[168],"requiring":[169],"application":[170],"changes,":[171],"preserves":[173],"ordering,":[174],"determinism,":[175],"low":[177],"overhead.":[178],"On":[179],"H100-SXM4":[180],"nodes":[181],"connected":[184],"NVLink":[185],"four":[187],"NDR400":[188],"rails,":[189],"achieves":[191],"up":[192,213],"2.3x":[194],"higher":[195,200],"3.8x":[199],"throughput":[202],"compared":[203],"single-path":[205],"baselines.":[206],"outperforms":[208],"by":[212],"5.2x":[215],"skewed":[217],"All-to-Allv":[218],"workloads":[219],"1.35x":[221],"end-to-end":[223],"LLM":[224],"MoE":[225],"workloads,":[226],"matching":[228],"baseline":[229],"performance":[230],"under":[231],"balanced":[232],"traffic.":[233]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-03T00:00:00"}
