{"id":"https://openalex.org/W7124169995","doi":"https://doi.org/10.1109/icpads67057.2025.11323136","title":"Straggler Dynamic Management for Distributed DNN Training","display_name":"Straggler Dynamic Management for Distributed DNN Training","publication_year":2025,"publication_date":"2025-12-14","ids":{"openalex":"https://openalex.org/W7124169995","doi":"https://doi.org/10.1109/icpads67057.2025.11323136"},"language":null,"primary_location":{"id":"doi:10.1109/icpads67057.2025.11323136","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpads67057.2025.11323136","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123018049","display_name":"Tiance Li","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tiance Li","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123023401","display_name":"Bo Chai","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Chai","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123022833","display_name":"Xiaobin Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaobin Tan","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123041755","display_name":"Shenzhi Yuan","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shenzhi Yuan","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123025456","display_name":"Kexin Ju","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kexin Ju","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075106826","display_name":"Shiyin Zhu","orcid":"https://orcid.org/0009-0008-5219-0885"},"institutions":[{"id":"https://openalex.org/I4210093776","display_name":"DHC Software (China)","ror":"https://ror.org/00kn8e190","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210093776"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiyin Zhu","raw_affiliation_strings":["H3C Technologies Co., Limited,Beijing,China"],"affiliations":[{"raw_affiliation_string":"H3C Technologies Co., Limited,Beijing,China","institution_ids":["https://openalex.org/I4210093776"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5123018049"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.69079618,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.5037999749183655,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.5037999749183655,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10080","display_name":"Energy Efficient Wireless Sensor Networks","score":0.05510000139474869,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13553","display_name":"Age of Information Optimization","score":0.03579999879002571,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.7364000082015991},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.6624000072479248},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5205000042915344},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.492000013589859},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.4650999903678894},{"id":"https://openalex.org/keywords/instruction-prefetch","display_name":"Instruction prefetch","score":0.43880000710487366},{"id":"https://openalex.org/keywords/dynamic-data","display_name":"Dynamic data","score":0.38600000739097595},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.35569998621940613}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7699999809265137},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.7364000082015991},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.6624000072479248},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5205000042915344},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.492000013589859},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4828999936580658},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.4650999903678894},{"id":"https://openalex.org/C133588205","wikidata":"https://www.wikidata.org/wiki/Q28455645","display_name":"Instruction prefetch","level":3,"score":0.43880000710487366},{"id":"https://openalex.org/C197298091","wikidata":"https://www.wikidata.org/wiki/Q5318963","display_name":"Dynamic data","level":2,"score":0.38600000739097595},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.373199999332428},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.35569998621940613},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.34790000319480896},{"id":"https://openalex.org/C195563490","wikidata":"https://www.wikidata.org/wiki/Q180368","display_name":"Network congestion","level":3,"score":0.33550000190734863},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C108734733","wikidata":"https://www.wikidata.org/wiki/Q1172333","display_name":"Data synchronization","level":3,"score":0.272599995136261},{"id":"https://openalex.org/C164660894","wikidata":"https://www.wikidata.org/wiki/Q2037833","display_name":"Piecewise","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C79487989","wikidata":"https://www.wikidata.org/wiki/Q934680","display_name":"Vehicle dynamics","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C161664118","wikidata":"https://www.wikidata.org/wiki/Q1089933","display_name":"Churning","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icpads67057.2025.11323136","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpads67057.2025.11323136","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G6958582101","display_name":null,"funder_award_id":"62101525,62341113,62021001","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W1497601926","https://openalex.org/W2083842231","https://openalex.org/W2268702383","https://openalex.org/W2769986458","https://openalex.org/W2920397365","https://openalex.org/W2962208489","https://openalex.org/W3189317289","https://openalex.org/W4290991009","https://openalex.org/W4313417875","https://openalex.org/W4327911839","https://openalex.org/W4400448554"],"related_works":[],"abstract_inverted_index":{"Straggler":[0,35],"nodes":[1],"are":[2],"a":[3,67,78,122,131,153],"major":[4],"bottleneck":[5],"in":[6,29,177],"large-scale":[7,43],"distributed":[8,44],"training,":[9],"degrading":[10],"efficiency":[11,173],"and":[12,55,63,71,85,91,107,130,143,149,157,174,192],"stability.":[13],"However,":[14],"current":[15],"solutions,":[16],"including":[17],"In-Network":[18],"Aggregation":[19],"(INA),":[20],"lack":[21],"the":[22,53,83,95,178],"adaptability":[23,193],"to":[24,58,98,126,139],"effectively":[25],"manage":[26],"these":[27],"stragglers":[28,76,186],"dynamic":[30,48],"environments.":[31],"This":[32],"paper":[33],"proposes":[34],"Dynamic":[36],"Management":[37],"(SDM),":[38],"an":[39],"adaptive":[40],"method":[41],"for":[42,88],"training":[45,172],"that":[46,81,135,168],"performs":[47],"straggler":[49,109,163,181],"management":[50],"by":[51],"coordinating":[52],"data":[54,84,96],"control":[56,86,112],"planes":[57,87],"achieve":[59],"accurate,":[60],"time-based":[61],"detection":[62,90],"efficient":[64,89],"mitigation":[65],"via":[66],"performanceaware":[68],"redundancy":[69,124],"strategy":[70],"semi-asynchronous":[72,132],"aggregation.":[73],"SDM":[74,151,169],"manages":[75],"through":[77],"coordinated":[79],"architecture":[80],"decouples":[82],"response.":[92],"It":[93],"leverages":[94],"plane":[97,113],"estimate":[99],"each":[100],"node's":[101],"remaining":[102],"completion":[103],"time,":[104],"ensuring":[105],"accurate":[106],"low-overhead":[108],"identification.":[110],"The":[111],"then":[114],"mitigates":[115],"their":[116],"impact":[117],"using":[118],"two":[119],"key":[120],"strategies:":[121],"performance-aware":[123],"scheme":[125],"reduce":[127],"waiting":[128],"delays,":[129],"aggregation":[133],"mechanism":[134],"dynamically":[136],"adjusts":[137],"synchronization":[138],"alleviate":[140],"gradient":[141],"staleness":[142],"improve":[144],"model":[145],"convergence.":[146],"We":[147],"implement":[148],"deploy":[150],"on":[152],"real-world":[154],"hardware":[155],"testbed":[156],"evaluate":[158],"its":[159],"performance":[160],"under":[161],"various":[162],"scenarios.":[164],"Experimental":[165],"results":[166],"demonstrate":[167],"significantly":[170],"improves":[171],"convergence":[175],"stability":[176],"presence":[179],"of":[180],"nodes,":[182],"particularly":[183],"when":[184],"multiple":[185],"occur":[187],"simultaneously,":[188],"exhibiting":[189],"greater":[190],"robustness":[191],"than":[194],"existing":[195],"methods.":[196]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-15T00:00:00"}
