{"id":"https://openalex.org/W2967053072","doi":"https://doi.org/10.1109/lca.2019.2933842","title":"Speeding up Collective Communications Through Inter-GPU Re-Routing","display_name":"Speeding up Collective Communications Through Inter-GPU Re-Routing","publication_year":2019,"publication_date":"2019-07-01","ids":{"openalex":"https://openalex.org/W2967053072","doi":"https://doi.org/10.1109/lca.2019.2933842","mag":"2967053072"},"language":"en","primary_location":{"id":"doi:10.1109/lca.2019.2933842","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2019.2933842","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018379891","display_name":"Kiran Ranganath","orcid":"https://orcid.org/0000-0001-8946-0000"},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Kiran Ranganath","raw_affiliation_strings":["University of California Riverside, Riverside, USA"],"raw_orcid":"https://orcid.org/0000-0001-8946-0000","affiliations":[{"raw_affiliation_string":"University of California Riverside, Riverside, USA","institution_ids":["https://openalex.org/I103635307"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070283092","display_name":"AmirAli Abdolrashidi","orcid":"https://orcid.org/0000-0003-4753-7481"},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"AmirAli Abdolrashidi","raw_affiliation_strings":["University of California Riverside, Riverside, USA"],"raw_orcid":"https://orcid.org/0000-0003-4753-7481","affiliations":[{"raw_affiliation_string":"University of California Riverside, Riverside, USA","institution_ids":["https://openalex.org/I103635307"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043209884","display_name":"Shuaiwen Leon Song","orcid":"https://orcid.org/0000-0002-8402-1436"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"The University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Shuaiwen Leon Song","raw_affiliation_strings":["University of Sydney, Camperdown, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Sydney, Camperdown, Australia","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5000712719","display_name":"Daniel Wong","orcid":"https://orcid.org/0000-0002-5376-7868"},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel Wong","raw_affiliation_strings":["University of California Riverside, Riverside, USA"],"raw_orcid":"https://orcid.org/0000-0002-5376-7868","affiliations":[{"raw_affiliation_string":"University of California Riverside, Riverside, USA","institution_ids":["https://openalex.org/I103635307"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5018379891"],"corresponding_institution_ids":["https://openalex.org/I103635307"],"apc_list":null,"apc_paid":null,"fwci":1.7279,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.84111913,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"18","issue":"2","first_page":"128","last_page":"131"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.8930379152297974},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8687278628349304},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6606522798538208},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.5100546479225159},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.43442028760910034},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.4276730716228485},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.42570990324020386},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.41366150975227356},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.40877407789230347},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3916906416416168},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3720352053642273},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.34469008445739746},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3010481297969818},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.1345706582069397},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.11200210452079773}],"concepts":[{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.8930379152297974},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8687278628349304},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6606522798538208},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.5100546479225159},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.43442028760910034},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.4276730716228485},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.42570990324020386},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.41366150975227356},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.40877407789230347},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3916906416416168},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3720352053642273},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.34469008445739746},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3010481297969818},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.1345706582069397},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.11200210452079773},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lca.2019.2933842","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2019.2933842","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5200347334","display_name":null,"funder_award_id":"DE-AC05-76RL01830","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G8307198512","display_name":null,"funder_award_id":"CCF-1815643","funder_id":"https://openalex.org/F4320335353","funder_display_name":"National Science Foundation of Sri Lanka"},{"id":"https://openalex.org/G8679700530","display_name":null,"funder_award_id":"66150","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320332550","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43"},{"id":"https://openalex.org/F4320335353","display_name":"National Science Foundation of Sri Lanka","ror":"https://ror.org/010xaa060"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1686810756","https://openalex.org/W2097117768","https://openalex.org/W2097643185","https://openalex.org/W2125551452","https://openalex.org/W2143303748","https://openalex.org/W2163605009","https://openalex.org/W2194775991","https://openalex.org/W2482039329","https://openalex.org/W2528745529","https://openalex.org/W2740001873","https://openalex.org/W2793315650","https://openalex.org/W2794670651","https://openalex.org/W2808274640","https://openalex.org/W2899071864","https://openalex.org/W2926767350","https://openalex.org/W3101708369","https://openalex.org/W4235366964","https://openalex.org/W6637373629","https://openalex.org/W6684191040","https://openalex.org/W6728089085","https://openalex.org/W6756009870"],"related_works":["https://openalex.org/W17155033","https://openalex.org/W4385894176","https://openalex.org/W2347371119","https://openalex.org/W3207760230","https://openalex.org/W1496222301","https://openalex.org/W1590307681","https://openalex.org/W2536018345","https://openalex.org/W2612768808","https://openalex.org/W4312814274","https://openalex.org/W4285370786"],"abstract_inverted_index":{"In":[0],"order":[1],"to":[2,46,57,61,66,121,128,138,174,197,207,244],"address":[3],"the":[4,79,93,108,124,148,218,223,227,232],"vast":[5],"needs":[6],"of":[7,25,110,131,167,222,242],"disparate":[8],"domains,":[9],"computing":[10],"engines":[11],"are":[12,42,144,154],"becoming":[13,43,96],"more":[14,36,97,179],"sophisticated":[15],"and":[16,49,76,82,88,102,151,182,200,211],"complex.":[17],"A":[18],"typical":[19],"high-performance":[20],"computational":[21,125],"engine":[22],"is":[23],"composed":[24],"several":[26],"accelerator":[27],"units,":[28],"in":[29,247],"most":[30],"cases":[31],"GPUs,":[32],"plus":[33],"one":[34],"or":[35,65],"CPU":[37],"controllers.":[38],"All":[39],"these":[40,58],"components":[41],"increasingly":[44],"interconnected":[45],"satisfy":[47],"bandwidth":[48,176,225],"latency":[50],"tolerance":[51],"demands":[52],"from":[53],"modern":[54],"workloads.":[55],"Due":[56],"constraints,":[59],"solutions":[60],"efficiently":[62],"interconnect":[63],"them":[64],"systematically":[67],"manage":[68],"their":[69],"traffic-such":[70],"as":[71,107],"PCIe":[72,198,213,233],"v3,":[73],"NVLink":[74,192,209,224],"v1":[75],"v2":[77],"on":[78,92],"hardware":[80,132],"side,":[81],"NVIDIA":[83],"Collective":[84],"Communication":[85],"Library":[86],"(NCCL)":[87],"AMD":[89],"ROCM":[90],"layer":[91],"software":[94],"side-are":[95],"commonplace":[98],"inside":[99],"HPC":[100],"systems":[101],"cloud":[103],"data":[104],"centers.":[105],"However,":[106],"number":[109],"accelerators":[111],"increases,":[112],"workloads":[113,254],"(especially":[114],"machine":[115,252],"learning)":[116],"might":[117],"not":[118,145],"be":[119],"able":[120],"fully":[122],"exploit":[123],"substrate":[126],"due":[127],"inefficient":[129],"use":[130],"interconnects.":[133],"Such":[134,215],"scenarios":[135],"can":[136],"lead":[137],"performance":[139],"bottlenecks":[140],"where":[141],"high-bandwidth":[142],"links":[143,153],"used":[146],"by":[147],"underlying":[149],"libraries":[150],"under-performing":[152],"overused.":[155],"This":[156],"work":[157],"proposes":[158],"Workload":[159],"Optimization":[160],"Through":[161],"Inter-GPU":[162],"Re-routing":[163],"(WOTIR),":[164],"which":[165],"consists":[166],"enhanced":[168],"NCCL-based":[169],"collective":[170],"primitives":[171],"that":[172],"aim":[173],"boost":[175],"utilization":[177,221],"(through":[178],"efficient":[180],"routing)":[181],"reduce":[183],"communication":[184,193,203],"overhead.":[185],"WOTIR":[186],"targets":[187],"GPUs":[188,206,228],"with":[189],"no":[190],"direct":[191],"path":[194],"(which":[195],"leads":[196],"communications)":[199],"instead":[201],"re-routes":[202],"through":[204,231],"intermediate":[205],"bridge":[208],"segments":[210],"avoid":[212],"communications.":[214],"method":[216],"allows":[217],"maximum":[219],"possible":[220],"between":[226],"without":[229],"routing":[230],"bus.":[234],"Using":[235],"this":[236],"method,":[237],"we":[238],"see":[239],"a":[240],"reduction":[241],"up":[243],"34":[245],"percent":[246],"execution":[248],"time":[249],"for":[250],"selected":[251],"learning":[253],"when":[255],"non-optimal":[256],"GPU":[257],"allocations":[258],"arise.":[259]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":6}],"updated_date":"2026-05-03T08:25:01.440150","created_date":"2025-10-10T00:00:00"}
