{"id":"https://openalex.org/W4210863326","doi":"https://doi.org/10.1109/mm.2022.3148670","title":"Accelerating Deep Learning Using Interconnect-Aware UCX Communication for MPI Collectives","display_name":"Accelerating Deep Learning Using Interconnect-Aware UCX Communication for MPI Collectives","publication_year":2022,"publication_date":"2022-02-07","ids":{"openalex":"https://openalex.org/W4210863326","doi":"https://doi.org/10.1109/mm.2022.3148670"},"language":"en","primary_location":{"id":"doi:10.1109/mm.2022.3148670","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mm.2022.3148670","pdf_url":null,"source":{"id":"https://openalex.org/S59697426","display_name":"IEEE Micro","issn_l":"0272-1732","issn":["0272-1732","1937-4143"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Micro","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081527852","display_name":"Y\u0131ltan Hassan Temu\u00e7in","orcid":"https://orcid.org/0000-0002-4145-4848"},"institutions":[{"id":"https://openalex.org/I204722609","display_name":"Queen's University","ror":"https://ror.org/02y72wh86","country_code":"CA","type":"education","lineage":["https://openalex.org/I204722609"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Yltan Hassan Temucin","raw_affiliation_strings":["Queen&#x2019;s University, Kingston, ON, Canada"],"affiliations":[{"raw_affiliation_string":"Queen&#x2019;s University, Kingston, ON, Canada","institution_ids":["https://openalex.org/I204722609"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031362631","display_name":"Amirhossein Sojoodi","orcid":"https://orcid.org/0000-0001-9877-3201"},"institutions":[{"id":"https://openalex.org/I204722609","display_name":"Queen's University","ror":"https://ror.org/02y72wh86","country_code":"CA","type":"education","lineage":["https://openalex.org/I204722609"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Amir Hossein Sojoodi","raw_affiliation_strings":["Queen&#x2019;s University, Kingston, ON, Canada"],"affiliations":[{"raw_affiliation_string":"Queen&#x2019;s University, Kingston, ON, Canada","institution_ids":["https://openalex.org/I204722609"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090580559","display_name":"Pedram Alizadeh","orcid":null},"institutions":[{"id":"https://openalex.org/I204722609","display_name":"Queen's University","ror":"https://ror.org/02y72wh86","country_code":"CA","type":"education","lineage":["https://openalex.org/I204722609"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Pedram Alizadeh","raw_affiliation_strings":["Queen&#x2019;s University, Kingston, ON, Canada"],"affiliations":[{"raw_affiliation_string":"Queen&#x2019;s University, Kingston, ON, Canada","institution_ids":["https://openalex.org/I204722609"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5098084074","display_name":"Benjamin Kitor","orcid":"https://orcid.org/0000-0002-8497-0578"},"institutions":[{"id":"https://openalex.org/I204722609","display_name":"Queen's University","ror":"https://ror.org/02y72wh86","country_code":"CA","type":"education","lineage":["https://openalex.org/I204722609"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Benjamin Kitor","raw_affiliation_strings":["Queen&#x2019;s University, Kingston, ON, Canada"],"affiliations":[{"raw_affiliation_string":"Queen&#x2019;s University, Kingston, ON, Canada","institution_ids":["https://openalex.org/I204722609"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039854005","display_name":"Ahmad Afsahi","orcid":"https://orcid.org/0000-0002-2924-6851"},"institutions":[{"id":"https://openalex.org/I204722609","display_name":"Queen's University","ror":"https://ror.org/02y72wh86","country_code":"CA","type":"education","lineage":["https://openalex.org/I204722609"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Ahmad Afsahi","raw_affiliation_strings":["Queen&#x2019;s University, Kingston, ON, Canada"],"affiliations":[{"raw_affiliation_string":"Queen&#x2019;s University, Kingston, ON, Canada","institution_ids":["https://openalex.org/I204722609"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5081527852"],"corresponding_institution_ids":["https://openalex.org/I204722609"],"apc_list":null,"apc_paid":null,"fwci":2.7256,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.89985108,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":"42","issue":"2","first_page":"68","last_page":"76"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9211883544921875},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.7819733619689941},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7029832601547241},{"id":"https://openalex.org/keywords/graphics-processing-unit","display_name":"Graphics processing unit","score":0.5267079472541809},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.48367103934288025},{"id":"https://openalex.org/keywords/message-passing-interface","display_name":"Message Passing Interface","score":0.47303837537765503},{"id":"https://openalex.org/keywords/network-topology","display_name":"Network topology","score":0.45981061458587646},{"id":"https://openalex.org/keywords/message-passing","display_name":"Message passing","score":0.4396608769893646},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.418239951133728},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.35126280784606934},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.2803819477558136},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.268579363822937},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.22844448685646057}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9211883544921875},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.7819733619689941},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7029832601547241},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.5267079472541809},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.48367103934288025},{"id":"https://openalex.org/C166782233","wikidata":"https://www.wikidata.org/wiki/Q127879","display_name":"Message Passing Interface","level":3,"score":0.47303837537765503},{"id":"https://openalex.org/C199845137","wikidata":"https://www.wikidata.org/wiki/Q145490","display_name":"Network topology","level":2,"score":0.45981061458587646},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.4396608769893646},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.418239951133728},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.35126280784606934},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2803819477558136},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.268579363822937},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.22844448685646057}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mm.2022.3148670","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mm.2022.3148670","pdf_url":null,"source":{"id":"https://openalex.org/S59697426","display_name":"IEEE Micro","issn_l":"0272-1732","issn":["0272-1732","1937-4143"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Micro","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4257696847","display_name":null,"funder_award_id":"05389-2016","funder_id":"https://openalex.org/F4320334593","funder_display_name":"Natural Sciences and Engineering Research Council of Canada"}],"funders":[{"id":"https://openalex.org/F4320314000","display_name":"Compute Canada","ror":"https://ror.org/03ty8yr27"},{"id":"https://openalex.org/F4320314005","display_name":"Western Canada Research Grid","ror":null},{"id":"https://openalex.org/F4320334593","display_name":"Natural Sciences and Engineering Research Council of Canada","ror":"https://ror.org/01h531d29"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W1962931680","https://openalex.org/W1987274898","https://openalex.org/W2488497244","https://openalex.org/W2777078856","https://openalex.org/W2888980102","https://openalex.org/W2898319404","https://openalex.org/W2903901007","https://openalex.org/W3017432630","https://openalex.org/W3039165326","https://openalex.org/W3103417206","https://openalex.org/W3204371345"],"related_works":["https://openalex.org/W2124048060","https://openalex.org/W4386915331","https://openalex.org/W2378910916","https://openalex.org/W2116006827","https://openalex.org/W2361929291","https://openalex.org/W2086666199","https://openalex.org/W1997862718","https://openalex.org/W1511717675","https://openalex.org/W2185992486","https://openalex.org/W2146057962"],"abstract_inverted_index":{"Deep":[0],"learning":[1,163],"workloads":[2],"on":[3,13,29,93,98],"modern":[4],"multi-graphics":[5],"processing":[6],"unit":[7],"(GPU)":[8],"nodes":[9],"are":[10],"highly":[11],"dependent":[12],"intranode":[14],"interconnects,":[15],"such":[16],"as":[17,116,118],"NVLink":[18,60,99],"and":[19,53,61,70,79,96,100],"PCIe,":[20],"for":[21,49,87,124,150],"high-performance":[22],"communication.":[23],"In":[24],"this":[25,106],"article,":[26],"we":[27],"take":[28],"the":[30,72,151],"challenge":[31],"to":[32,44,82,108,141],"design":[33,115],"an":[34],"interconnect-aware":[35],"multipath":[36,65],"GPU-to-GPU":[37],"communication":[38,41,77],"using":[39],"unified":[40],"X":[42],"(UCX)":[43],"utilize":[45,105],"all":[46],"available":[47],"bandwidth":[48,86],"both":[50],"NVLink-based":[51,94],"systems":[52,95],"those":[54],"that":[55],"use":[56],"a":[57,110,119,136,159],"mixture":[58],"of":[59,161],"PCIe.":[62],"Our":[63],"proposed":[64,133,152],"data":[66],"transfer":[67],"mechanism":[68,107],"pipelines":[69],"stripes":[71],"message":[73,89],"across":[74],"multiple":[75],"intrasocket":[76],"channels":[78],"memory":[80],"regions":[81],"achieve":[83,135],"1.84\u00d7":[84],"higher":[85],"Open":[88],"passing":[90],"interface":[91],"(MPI)":[92],"1.23\u00d7":[97],"PCIe":[101],"systems.":[102],"We":[103,145],"then":[104],"propose":[109],"three-stage":[111],"hierarchical,":[112],"pipelined":[113,121],"MPI_Allreduce":[114,153],"well":[117],"flat":[120],"two-stage":[122],"algorithm":[123],"two":[125],"different":[126],"node":[127],"topologies.":[128],"For":[129],"large":[130],"messages,":[131],"our":[132],"algorithms":[134],"high":[137],"speedup":[138,149],"when":[139],"compared":[140],"other":[142],"MPI":[143],"implementations.":[144],"also":[146],"observe":[147],"significant":[148],"with":[154,158],"Horovod":[155],"+":[156],"TensorFlow":[157],"variety":[160],"deep":[162],"models.":[164]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
