{"id":"https://openalex.org/W4414646971","doi":"https://doi.org/10.1109/vtc2025-spring65109.2025.11174740","title":"Cooperative Two-Dimensional Hierarchical All-Reduce for Large Language Model Training with Heterogeneous Communication Links","display_name":"Cooperative Two-Dimensional Hierarchical All-Reduce for Large Language Model Training with Heterogeneous Communication Links","publication_year":2025,"publication_date":"2025-06-17","ids":{"openalex":"https://openalex.org/W4414646971","doi":"https://doi.org/10.1109/vtc2025-spring65109.2025.11174740"},"language":"en","primary_location":{"id":"doi:10.1109/vtc2025-spring65109.2025.11174740","is_oa":false,"landing_page_url":"https://doi.org/10.1109/vtc2025-spring65109.2025.11174740","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 101st Vehicular Technology Conference (VTC2025-Spring)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059667505","display_name":"Xiangming Zhu","orcid":"https://orcid.org/0000-0002-2166-3503"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiangming Zhu","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China,311121"],"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China,311121","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004866559","display_name":"Shanyun Liu","orcid":"https://orcid.org/0009-0003-2534-2330"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shanyun Liu","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China,311121"],"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China,311121","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004769887","display_name":"Jingfei Chang","orcid":"https://orcid.org/0000-0003-0530-6511"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingfei Chang","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China,311121"],"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China,311121","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5008473103","display_name":"Hongyang Chen","orcid":"https://orcid.org/0000-0002-7626-0162"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongyang Chen","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China,311121"],"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China,311121","institution_ids":["https://openalex.org/I4210123185"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5059667505"],"corresponding_institution_ids":["https://openalex.org/I4210123185"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14727659,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9452000260353088,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9452000260353088,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.8180000185966492},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5978000164031982},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.490200012922287},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.48989999294281006},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44020000100135803},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4120999872684479},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3880000114440918},{"id":"https://openalex.org/keywords/communications-system","display_name":"Communications system","score":0.3458000123500824}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8300999999046326},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.8180000185966492},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5978000164031982},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.490200012922287},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.48989999294281006},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4790000021457672},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.47429999709129333},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44020000100135803},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4120999872684479},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4000000059604645},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3880000114440918},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.3626999855041504},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.3458000123500824},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.34549999237060547},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C2992525071","wikidata":"https://www.wikidata.org/wiki/Q50818671","display_name":"Federated learning","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C2779582901","wikidata":"https://www.wikidata.org/wiki/Q21013010","display_name":"Distributed learning","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C2983306500","wikidata":"https://www.wikidata.org/wiki/Q52946","display_name":"Voice communication","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2587999999523163},{"id":"https://openalex.org/C195563490","wikidata":"https://www.wikidata.org/wiki/Q180368","display_name":"Network congestion","level":3,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/vtc2025-spring65109.2025.11174740","is_oa":false,"landing_page_url":"https://doi.org/10.1109/vtc2025-spring65109.2025.11174740","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 101st Vehicular Technology Conference (VTC2025-Spring)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2057332538","https://openalex.org/W2170796499","https://openalex.org/W2926767350","https://openalex.org/W2981114289","https://openalex.org/W3092260380","https://openalex.org/W4313479568","https://openalex.org/W4321487996","https://openalex.org/W4360831831","https://openalex.org/W4386785058","https://openalex.org/W4387490420","https://openalex.org/W4392903998","https://openalex.org/W4393406935"],"related_works":[],"abstract_inverted_index":{"Parallel":[0],"and":[1,22,59],"distributed":[2],"learning":[3],"frameworks":[4],"are":[5,78],"now":[6],"widely":[7],"employed":[8],"for":[9,47],"the":[10,17,27,33,52,55,60,64,69,82,85],"training":[11,34],"of":[12,54,84],"large":[13,19],"language":[14],"models.":[15],"However,":[16],"extremely":[18],"model":[20],"size":[21,24],"data":[23],"have":[25],"made":[26],"communication":[28,57,62],"a":[29],"bottleneck,":[30],"severely":[31],"reducing":[32],"efficiency.":[35],"In":[36],"this":[37],"paper,":[38],"we":[39],"introduce":[40],"two":[41],"novel":[42],"hierarchical":[43],"allreduce":[44],"algorithms":[45,87],"designed":[46],"2D-torus":[48],"topology.":[49],"By":[50],"exploiting":[51],"cooperation":[53],"intra-cluster":[56],"link":[58,71],"inter-cluster":[61],"link,":[63],"congestion":[65],"bottleneck":[66],"caused":[67],"by":[68],"imbalanced":[70],"bandwidth":[72],"can":[73],"be":[74],"alleviated.":[75],"Numerical":[76],"results":[77],"presented":[79],"to":[80],"demonstrate":[81],"performance":[83],"proposed":[86],"in":[88],"comparison":[89],"with":[90],"existing":[91],"baseline":[92],"algorithms.":[93]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
