{"id":"https://openalex.org/W4413967041","doi":"https://doi.org/10.1109/tpds.2025.3605491","title":"DynPipe: Toward Dynamic End-to-End Pipeline Parallelism for Interference-Aware DNN Training","display_name":"DynPipe: Toward Dynamic End-to-End Pipeline Parallelism for Interference-Aware DNN Training","publication_year":2025,"publication_date":"2025-09-03","ids":{"openalex":"https://openalex.org/W4413967041","doi":"https://doi.org/10.1109/tpds.2025.3605491"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2025.3605491","is_oa":true,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3605491","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1109/tpds.2025.3605491","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhengyi Yuan","orcid":"https://orcid.org/0009-0000-6532-7176"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhengyi Yuan","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115596416","display_name":"Xiong Wang","orcid":"https://orcid.org/0000-0001-5360-0454"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiong Wang","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069200204","display_name":"Yufeng Nie","orcid":"https://orcid.org/0000-0001-7881-5806"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuntao Nie","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048609852","display_name":"Yufei Tao","orcid":"https://orcid.org/0000-0003-3883-5452"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yufei Tao","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100379940","display_name":"Yuqing Li","orcid":"https://orcid.org/0000-0003-0816-5777"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuqing Li","raw_affiliation_strings":["School of Cyber Science and Engineering, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066761150","display_name":"Zhiyuan Shao","orcid":"https://orcid.org/0000-0003-2139-6465"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyuan Shao","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022398389","display_name":"Xiaofei Liao","orcid":"https://orcid.org/0000-0001-6302-813X"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaofei Liao","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100374506","display_name":"Bo Li","orcid":"https://orcid.org/0000-0003-2083-9105"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Bo Li","raw_affiliation_strings":["Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Clear Water Bay, Hong Kong","Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Clear Water Bay, Hong Kong","institution_ids":["https://openalex.org/I200769079"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"last","author":{"id":null,"display_name":"Hai Jin","orcid":"https://orcid.org/0000-0002-3934-7605"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hai Jin","raw_affiliation_strings":["National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Big Data Technology and System, Services Computing Technology and System Lab/Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I47720641"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.21974183,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"36","issue":"11","first_page":"2366","last_page":"2382"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10444","display_name":"Context-Aware Activity Recognition Systems","score":0.9541000127792358,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10444","display_name":"Context-Aware Activity Recognition Systems","score":0.9541000127792358,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.946399986743927,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.9440000057220459,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8419526815414429},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7684789299964905},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.6847999095916748},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.5663120150566101},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5112042427062988},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4973750412464142},{"id":"https://openalex.org/keywords/interference","display_name":"Interference (communication)","score":0.4716041684150696},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3536481261253357},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.23374119400978088},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.19468757510185242},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.0890592634677887}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8419526815414429},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7684789299964905},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.6847999095916748},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.5663120150566101},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5112042427062988},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4973750412464142},{"id":"https://openalex.org/C32022120","wikidata":"https://www.wikidata.org/wiki/Q797225","display_name":"Interference (communication)","level":3,"score":0.4716041684150696},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3536481261253357},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.23374119400978088},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.19468757510185242},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0890592634677887},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2025.3605491","is_oa":true,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3605491","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/tpds.2025.3605491","is_oa":true,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3605491","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1566289585","https://openalex.org/W2194775991","https://openalex.org/W2896457183","https://openalex.org/W2951341874","https://openalex.org/W2965658867","https://openalex.org/W2969388332","https://openalex.org/W2975712713","https://openalex.org/W3081168214","https://openalex.org/W3132107458","https://openalex.org/W3157306683","https://openalex.org/W3188270315","https://openalex.org/W3204998121","https://openalex.org/W4213251304","https://openalex.org/W4214549129","https://openalex.org/W4226183928","https://openalex.org/W4288079579","https://openalex.org/W4290991121","https://openalex.org/W4310282800","https://openalex.org/W4311209912","https://openalex.org/W4318541676","https://openalex.org/W4321487996","https://openalex.org/W4327694855","https://openalex.org/W4383749415","https://openalex.org/W4384154386","https://openalex.org/W4385245566","https://openalex.org/W4387302750","https://openalex.org/W4388031315","https://openalex.org/W4388031348","https://openalex.org/W4394998727","https://openalex.org/W4399073645","https://openalex.org/W4400762160","https://openalex.org/W4404181242","https://openalex.org/W4405418476","https://openalex.org/W4411403514"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W3179968364","https://openalex.org/W2045183646","https://openalex.org/W2162409446","https://openalex.org/W1999612375","https://openalex.org/W2938107654","https://openalex.org/W2109463584","https://openalex.org/W2504075107","https://openalex.org/W3037187668","https://openalex.org/W2033862586"],"abstract_inverted_index":{"Pipeline":[0],"parallelism":[1],"has":[2],"emerged":[3],"as":[4,137],"an":[5,60,102],"indispensable":[6],"technique":[7],"for":[8],"training":[9,73,145],"large":[10],"deep":[11],"neural":[12],"networks.":[13],"While":[14],"existing":[15],"asynchronous":[16,64],"pipeline":[17,65,104,169],"systems":[18],"address":[19],"the":[20,69,84,108,131,144,148,183],"time":[21],"bubbles":[22],"inherent":[23],"in":[24,75,171],"synchronous":[25],"architectures,":[26],"they":[27],"continue":[28],"to":[29,39,46,67,129,158],"suffer":[30],"from":[31],"<italic":[32,36,40,50,61,70,77,85,91,118,153,186],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[33,37,41,51,62,71,78,86,92,119,154,187],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">inefficiency</i>":[34],"and":[35,49,90,140,162],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">susceptibility</i>":[38],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">volatile</i>":[42],"hardware":[43,109],"environment":[44],"due":[45],"their":[47],"suboptimal":[48],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">static</i>":[52],"configurations.":[53],"In":[54],"this":[55],"paper,":[56],"we":[57],"propose":[58],"DynPipe,":[59],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">interference-aware</i>":[63],"framework":[66],"optimize":[68],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">end-to-end</i>":[72],"performance":[74],"highly":[76],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">dynamic</i>":[79],"computing":[80],"environments.":[81,173],"By":[82],"characterizing":[83],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">non-overlapped</i>":[87],"communication":[88],"overheads":[89],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">convergence</i>":[93],"rate":[94],"conditioned":[95],"on":[96,143],"stage-wise":[97],"staleness,":[98],"DynPipe":[99,115,151,178],"carefully":[100],"crafts":[101],"optimized":[103],"partition":[105,156],"that":[106,124,177],"harmonizes":[107],"speed":[110],"with":[111],"statistical":[112],"convergence.":[113],"Moreover,":[114],"deploys":[116],"a":[117],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">non-intrusive</i>":[120],"random":[121],"forest":[122],"model":[123],"utilizes":[125],"runtime":[126],"stage":[127],"statistics":[128],"evaluate":[130],"impact":[132],"of":[133],"environmental":[134],"changes,":[135],"such":[136],"task":[138],"interference":[139],"network":[141],"jitter,":[142],"efficiency.":[146],"Following":[147],"evaluation":[149],"guidance,":[150],"adaptively":[152],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">adjusts</i>":[155],"plan":[157],"restore":[159],"both":[160],"intra":[161],"inter-stage":[163],"load":[164],"balancing,":[165],"thereby":[166],"facilitating":[167],"seamless":[168],"reconfiguration":[170],"dynamic":[172],"Extensive":[174],"experiments":[175],"show":[176],"outperforms":[179],"state-of-the-art":[180],"systems,":[181],"accelerating":[182],"time-to-accuracy":[184],"by":[185],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1.5-3.4\u00d7</i>.":[188]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-09-04T00:00:00"}
