{"id":"https://openalex.org/W4401687219","doi":"https://doi.org/10.1109/tnet.2024.3441039","title":"Straggler-Aware Gradient Aggregation for Large-Scale Distributed Deep Learning System","display_name":"Straggler-Aware Gradient Aggregation for Large-Scale Distributed Deep Learning System","publication_year":2024,"publication_date":"2024-08-19","ids":{"openalex":"https://openalex.org/W4401687219","doi":"https://doi.org/10.1109/tnet.2024.3441039"},"language":"en","primary_location":{"id":"doi:10.1109/tnet.2024.3441039","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnet.2024.3441039","pdf_url":null,"source":{"id":"https://openalex.org/S62238642","display_name":"IEEE/ACM Transactions on Networking","issn_l":"1063-6692","issn":["1063-6692","1558-2566"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Networking","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059077556","display_name":"Yijun Li","orcid":"https://orcid.org/0000-0003-4335-8742"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yijun Li","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008483730","display_name":"Jiawei Huang","orcid":"https://orcid.org/0000-0002-7578-4490"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiawei Huang","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014213647","display_name":"Zhaoyi Li","orcid":"https://orcid.org/0000-0002-9677-2368"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaoyi Li","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053781404","display_name":"Jingling Liu","orcid":"https://orcid.org/0000-0001-8743-0270"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingling Liu","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102654693","display_name":"Shengwen Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengwen Zhou","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100678702","display_name":"Tao Zhang","orcid":"https://orcid.org/0000-0003-1559-406X"},"institutions":[{"id":"https://openalex.org/I198357462","display_name":"Changsha University","ror":"https://ror.org/011d8sm39","country_code":"CN","type":"education","lineage":["https://openalex.org/I198357462"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Zhang","raw_affiliation_strings":["Hunan Province Key Laboratory of Industrial Internet Technology and Security, Changsha University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"Hunan Province Key Laboratory of Industrial Internet Technology and Security, Changsha University, Changsha, China","institution_ids":["https://openalex.org/I198357462"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027239963","display_name":"Wanchun Jiang","orcid":"https://orcid.org/0000-0001-5067-321X"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wanchun Jiang","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100438360","display_name":"Jianxin Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianxin Wang","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5059077556"],"corresponding_institution_ids":["https://openalex.org/I139660479"],"apc_list":null,"apc_paid":null,"fwci":1.366,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.84007282,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"32","issue":"6","first_page":"4917","last_page":"4930"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.8956999778747559,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.8956999778747559,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.795199990272522,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.777999997138977,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5908135771751404},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5668244361877441},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.41895201802253723},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3559790849685669},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.07874047756195068},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.05442383885383606}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5908135771751404},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5668244361877441},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.41895201802253723},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3559790849685669},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.07874047756195068},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.05442383885383606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tnet.2024.3441039","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnet.2024.3441039","pdf_url":null,"source":{"id":"https://openalex.org/S62238642","display_name":"IEEE/ACM Transactions on Networking","issn_l":"1063-6692","issn":["1063-6692","1558-2566"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Networking","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3020578730","display_name":null,"funder_award_id":"62302524","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3358112784","display_name":null,"funder_award_id":"62132022","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320334023","display_name":"Arkansas High Performance Computing Center, University of Arkansas","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1497601926","https://openalex.org/W2083842231","https://openalex.org/W2143612262","https://openalex.org/W2194775991","https://openalex.org/W2268702383","https://openalex.org/W2523435939","https://openalex.org/W2618530766","https://openalex.org/W2759371073","https://openalex.org/W2761338514","https://openalex.org/W2769986458","https://openalex.org/W2798515322","https://openalex.org/W2919115771","https://openalex.org/W2920397365","https://openalex.org/W2963909453","https://openalex.org/W2969388332","https://openalex.org/W2982664135","https://openalex.org/W3009014747","https://openalex.org/W3022298203","https://openalex.org/W3129895389","https://openalex.org/W3189317289","https://openalex.org/W3189640247","https://openalex.org/W4290991009","https://openalex.org/W4313417875","https://openalex.org/W4327911839","https://openalex.org/W4385357021","https://openalex.org/W4389989295","https://openalex.org/W4392024151","https://openalex.org/W4401352466","https://openalex.org/W6637373629","https://openalex.org/W6679393576","https://openalex.org/W6683738474","https://openalex.org/W6713134421","https://openalex.org/W6730190955","https://openalex.org/W6748645090","https://openalex.org/W6756009870","https://openalex.org/W6758283263","https://openalex.org/W6759814162","https://openalex.org/W6760703128","https://openalex.org/W6774103200","https://openalex.org/W6778729859","https://openalex.org/W6779103662","https://openalex.org/W6780667297","https://openalex.org/W6784425352","https://openalex.org/W6787972765","https://openalex.org/W6792106764","https://openalex.org/W6794216301","https://openalex.org/W6839850428","https://openalex.org/W6855921298"],"related_works":["https://openalex.org/W2731899572","https://openalex.org/W3215138031","https://openalex.org/W3009238340","https://openalex.org/W4321369474","https://openalex.org/W4360585206","https://openalex.org/W4285208911","https://openalex.org/W3082895349","https://openalex.org/W4213079790","https://openalex.org/W2248239756","https://openalex.org/W4323565446"],"abstract_inverted_index":{"Deep":[0],"Neural":[1],"Network":[2],"(DNN)":[3],"is":[4],"a":[5,9,93],"critical":[6],"component":[7],"of":[8,12,19,34,57],"wide":[10],"range":[11],"applications.":[13],"However,":[14],"with":[15],"the":[16,20,28,50,55,74,101,106,114,132],"rapid":[17],"growth":[18],"training":[21,141],"dataset":[22],"and":[23,109,139],"model":[24],"size,":[25],"communication":[26],"becomes":[27],"bottleneck,":[29],"resulting":[30],"in":[31,49,80,152],"low":[32],"utilization":[33],"computing":[35,108],"resources.":[36],"To":[37,87],"accelerate":[38],"communication,":[39],"recent":[40],"works":[41],"propose":[42,92],"to":[43,53,65,84,104,112,137,144],"aggregate":[44,66],"gradients":[45],"from":[46,73],"multiple":[47],"workers":[48],"programmable":[51],"switch":[52],"reduce":[54],"volume":[56],"exchanged":[58],"data.":[59],"Unfortunately,":[60],"since":[61],"using":[62,122],"synchronization":[63],"transmission":[64],"data,":[67],"current":[68],"in-network":[69],"aggregation":[70,95],"designs":[71],"suffer":[72],"straggler":[75],"problem,":[76],"which":[77,99],"often":[78],"occurs":[79],"shared":[81],"clusters":[82,121],"due":[83],"resource":[85],"contention.":[86],"address":[88],"this":[89],"issue,":[90],"we":[91],"straggler-aware":[94],"transport":[96],"protocol":[97],"(SA-ATP),":[98],"enables":[100],"leading":[102],"worker":[103],"leverage":[105],"spare":[107],"storage":[110],"resources":[111],"help":[113],"straggling":[115],"worker.":[116],"We":[117],"implement":[118],"SA-ATP":[119,130],"atop":[120],"P4-programmable":[123],"switches.":[124],"The":[125],"evaluation":[126],"results":[127],"show":[128],"that":[129],"reduces":[131],"iteration":[133],"time":[134],"by":[135,142],"up":[136,143],"57%":[138],"accelerates":[140],"<inline-formula":[145],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[146],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">":[147],"<tex-math":[148],"notation=\"LaTeX\">$1.8\\times":[149],"$":[150],"</tex-math></inline-formula>":[151],"real-world":[153],"benchmark":[154],"models.":[155]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
