{"id":"https://openalex.org/W4414908856","doi":"https://doi.org/10.1109/icdcs63083.2025.00019","title":"Bandwidth Optimized Scalable Designs with Inter-Layer Overlapping for MPI Broadcast","display_name":"Bandwidth Optimized Scalable Designs with Inter-Layer Overlapping for MPI Broadcast","publication_year":2025,"publication_date":"2025-07-21","ids":{"openalex":"https://openalex.org/W4414908856","doi":"https://doi.org/10.1109/icdcs63083.2025.00019"},"language":"en","primary_location":{"id":"doi:10.1109/icdcs63083.2025.00019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icdcs63083.2025.00019","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 45th International Conference on Distributed Computing Systems (ICDCS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100647639","display_name":"Yi Dai","orcid":"https://orcid.org/0000-0003-1219-2436"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yi Dai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101540573","display_name":"Qi Zhu","orcid":"https://orcid.org/0000-0003-4053-080X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi Zhu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103215978","display_name":"Jiaqing Xu","orcid":"https://orcid.org/0000-0002-8555-287X"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaqing Xu","raw_affiliation_strings":["National University of Defense Technology,College of Computer Science and Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,College of Computer Science and Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100367009","display_name":"Qiang Wang","orcid":"https://orcid.org/0000-0002-7078-7545"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiang Wang","raw_affiliation_strings":["National University of Defense Technology,College of Computer Science and Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,College of Computer Science and Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000683189","display_name":"Bo Yang","orcid":"https://orcid.org/0000-0002-1108-4006"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Yang","raw_affiliation_strings":["National University of Defense Technology,College of Computer Science and Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,College of Computer Science and Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012246988","display_name":"Min Xie","orcid":"https://orcid.org/0000-0003-3065-2918"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Min Xie","raw_affiliation_strings":["National University of Defense Technology,College of Computer Science and Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,College of Computer Science and Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5116668183","display_name":"Dongsheng Li","orcid":"https://orcid.org/0000-0001-8143-4449"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongsheng Li","raw_affiliation_strings":["National University of Defense Technology,College of Computer Science and Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,College of Computer Science and Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100647639"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.26852658,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"100","last_page":"110"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10187","display_name":"Radio Frequency Integrated Circuit Design","score":0.9787999987602234,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10187","display_name":"Radio Frequency Integrated Circuit Design","score":0.9787999987602234,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9706000089645386,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9549000263214111,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7092000246047974},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.5511000156402588},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.49970000982284546},{"id":"https://openalex.org/keywords/message-passing","display_name":"Message passing","score":0.46720001101493835},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.46309998631477356},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.45399999618530273},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4471000134944916},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.3725000023841858},{"id":"https://openalex.org/keywords/programming-paradigm","display_name":"Programming paradigm","score":0.3707999885082245}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9107999801635742},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7092000246047974},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.580299973487854},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.5511000156402588},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5442000031471252},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.49970000982284546},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.46720001101493835},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.46309998631477356},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.45399999618530273},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4471000134944916},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3725000023841858},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.3707999885082245},{"id":"https://openalex.org/C943373","wikidata":"https://www.wikidata.org/wiki/Q4817323","display_name":"Atomic broadcast","level":3,"score":0.3698999881744385},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.3531000018119812},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3398999869823456},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.3179999887943268},{"id":"https://openalex.org/C166782233","wikidata":"https://www.wikidata.org/wiki/Q127879","display_name":"Message Passing Interface","level":3,"score":0.31139999628067017},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C1793878","wikidata":"https://www.wikidata.org/wiki/Q1153762","display_name":"Out-of-order execution","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C137364921","wikidata":"https://www.wikidata.org/wiki/Q27929394","display_name":"Parallel programming model","level":3,"score":0.2653999924659729},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.26460000872612},{"id":"https://openalex.org/C509933004","wikidata":"https://www.wikidata.org/wiki/Q194163","display_name":"Broadband","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icdcs63083.2025.00019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icdcs63083.2025.00019","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 45th International Conference on Distributed Computing Systems (ICDCS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"MPI":[0,29,103,224],"(Massage":[1],"Passing":[2],"Interface)":[3],"has":[4],"been":[5],"the":[6,20,32,55,73,81,108,117,139,195,206,236,239,247],"dominant":[7],"programming":[8],"model":[9,243],"for":[10,175,235],"developing":[11],"large-scale":[12,144],"parallel":[13],"applications.":[14],"Existing":[15],"work":[16],"mainly":[17],"focuses":[18],"on":[19],"vast":[21],"parallelism":[22],"of":[23,54,83,102,143,197,217,238,244],"modern":[24,37],"multi-/many-cores":[25],"architectures":[26],"to":[27,69,93,155,179,193,219,222,232],"parallelize":[28],"collectives.":[30],"However,":[31],"abundant":[33],"bandwidth":[34,99],"provided":[35],"by":[36,48,160],"interconnects":[38],"is":[39,52,153],"either":[40],"underutilized":[41],"when":[42],"processing":[43],"small":[44],"messages":[45],"or":[46],"overwhelmed":[47],"large":[49],"messages.":[50],"MPI_Bcast":[51,212],"one":[53,67],"most":[56],"widely":[57],"used":[58],"collective":[59],"primitives":[60],"in":[61,72,91,130],"MPI,":[62],"which":[63],"broadcasts":[64],"data":[65,166],"from":[66,87],"process":[68],"all":[70],"processes":[71],"communication":[74],"domain.":[75],"In":[76],"this":[77,198],"paper,":[78],"we":[79,114],"address":[80],"issue":[82],"load":[84,110],"imbalance":[85],"arising":[86],"traditional":[88],"tree-based":[89],"designs":[90,213],"order":[92],"strike":[94],"a":[95,131,148],"better":[96],"balance":[97],"between":[98],"and":[100,133,141,164,186,208],"latency":[101,159],"Broadcast.":[104],"By":[105],"evenly":[106],"distributing":[107],"broadcast":[109,126,158],"across":[111,127],"lower":[112],"layers,":[113],"effectively":[115],"leverage":[116],"available":[118],"resources":[119],"at":[120,183,204],"upper-layer":[121],"nodes":[122],"that":[123],"recursively":[124],"execute":[125],"different":[128],"layers":[129],"sequential":[132],"contention-free":[134],"manner.":[135],"This":[136],"approach":[137],"improves":[138],"scalability":[140],"performance":[142,196,215],"message":[145,185],"broadcasts.":[146],"Additionally,":[147],"generic":[149],"inter-layer":[150,163],"overlapped":[151],"scheme":[152,174],"proposed":[154],"reduce":[156],"overall":[157],"fully":[161],"overlapping":[162],"intra-layer":[165],"transmission.":[167],"We":[168],"further":[169],"implement":[170],"an":[171],"online":[172],"adaptive":[173],"tree":[176],"degree":[177],"tuning":[178],"achieve":[180],"optimal":[181],"design":[182,203],"various":[184],"system":[187],"sizes.":[188],"Extensive":[189],"experiments":[190],"are":[191],"conducted":[192],"evaluate":[194],"Bandwidth-optimized":[199],"Inter-layer":[200],"Overlapping":[201],"(BIO)":[202],"both":[205],"microbenchmark":[207],"application":[209],"levels.":[210],"BIO-based":[211],"demonstrate":[214],"speedups":[216],"up":[218,231],"2.71x":[220],"compared":[221],"state-of-the-art":[223],"libraries.":[225],"For":[226],"application-level":[227],"evaluation,":[228],"BIO":[229],"provides":[230],"165%":[233],"acceleration":[234],"initialization":[237],"distributed":[240],"deep":[241],"learning":[242],"Horovod":[245],"with":[246],"PyTorch":[248],"application.":[249]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
