{"id":"https://openalex.org/W4376505605","doi":"https://doi.org/10.1145/3597204","title":"Rise of Distributed Deep Learning Training in the Big Model Era: From a Software Engineering Perspective","display_name":"Rise of Distributed Deep Learning Training in the Big Model Era: From a Software Engineering Perspective","publication_year":2023,"publication_date":"2023-05-13","ids":{"openalex":"https://openalex.org/W4376505605","doi":"https://doi.org/10.1145/3597204"},"language":"en","primary_location":{"id":"doi:10.1145/3597204","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3597204","pdf_url":null,"source":{"id":"https://openalex.org/S142627899","display_name":"ACM Transactions on Software Engineering and Methodology","issn_l":"1049-331X","issn":["1049-331X","1557-7392"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Software Engineering and Methodology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052249316","display_name":"Xuanzhe Liu","orcid":"https://orcid.org/0000-0002-7908-8484"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xuanzhe Liu","raw_affiliation_strings":["Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038736328","display_name":"Diandian Gu","orcid":"https://orcid.org/0000-0002-3591-3892"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Diandian Gu","raw_affiliation_strings":["Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031457464","display_name":"Zhenpeng Chen","orcid":"https://orcid.org/0000-0002-4765-1893"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Zhenpeng Chen","raw_affiliation_strings":["University College London, UK"],"affiliations":[{"raw_affiliation_string":"University College London, UK","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088948176","display_name":"Jinfeng Wen","orcid":"https://orcid.org/0000-0003-3023-1005"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinfeng Wen","raw_affiliation_strings":["Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100603108","display_name":"Zili Zhang","orcid":"https://orcid.org/0000-0003-4209-9451"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zili Zhang","raw_affiliation_strings":["Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101614814","display_name":"Yun Ma","orcid":"https://orcid.org/0000-0001-7866-4075"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yun Ma","raw_affiliation_strings":["Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115695530","display_name":"Haoyu Wang","orcid":"https://orcid.org/0000-0003-1100-8633"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoyu Wang","raw_affiliation_strings":["Huazhong University of Science and Technology, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101882004","display_name":"Xin Jin","orcid":"https://orcid.org/0000-0001-8741-5847"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Jin","raw_affiliation_strings":["Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5052249316"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":1.389,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.81771989,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":"32","issue":"6","first_page":"1","last_page":"26"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9901000261306763,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8196455240249634},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.629848837852478},{"id":"https://openalex.org/keywords/debugging","display_name":"Debugging","score":0.5674893260002136},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.5235971212387085},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5136231780052185},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.4964337944984436},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.4752918481826782},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.4515898525714874},{"id":"https://openalex.org/keywords/software-fault-tolerance","display_name":"Software fault tolerance","score":0.4225688576698303},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4077792167663574},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3934701383113861},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3700001835823059},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.3420126736164093},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.18321743607521057}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8196455240249634},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.629848837852478},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.5674893260002136},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.5235971212387085},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5136231780052185},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.4964337944984436},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4752918481826782},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.4515898525714874},{"id":"https://openalex.org/C50712370","wikidata":"https://www.wikidata.org/wiki/Q4269346","display_name":"Software fault tolerance","level":3,"score":0.4225688576698303},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4077792167663574},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3934701383113861},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3700001835823059},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.3420126736164093},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.18321743607521057}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3597204","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3597204","pdf_url":null,"source":{"id":"https://openalex.org/S142627899","display_name":"ACM Transactions on Software Engineering and Methodology","issn_l":"1049-331X","issn":["1049-331X","1557-7392"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Software Engineering and Methodology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4300000071525574,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1179278725","display_name":null,"funder_award_id":"62172008 and 62102009","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2802911279","display_name":null,"funder_award_id":"Young","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G391238517","display_name":null,"funder_award_id":", and","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4497957390","display_name":null,"funder_award_id":"62172008","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7719771593","display_name":null,"funder_award_id":"62102009","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335581","display_name":"Young Scientists Fund","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2039676055","https://openalex.org/W2053154970","https://openalex.org/W2104577574","https://openalex.org/W2113175552","https://openalex.org/W2149490731","https://openalex.org/W2164777277","https://openalex.org/W2582956876","https://openalex.org/W2751343396","https://openalex.org/W2787998955","https://openalex.org/W2788193959","https://openalex.org/W2924425593","https://openalex.org/W2930508541","https://openalex.org/W2968594320","https://openalex.org/W2975712713","https://openalex.org/W2981937105","https://openalex.org/W3092854162","https://openalex.org/W3138798301","https://openalex.org/W3206636350","https://openalex.org/W4213412050","https://openalex.org/W4233671654","https://openalex.org/W4233730851","https://openalex.org/W4287991080","https://openalex.org/W4290991168","https://openalex.org/W4299563208","https://openalex.org/W4301239768","https://openalex.org/W6771893701"],"related_works":["https://openalex.org/W2971479921","https://openalex.org/W3145923041","https://openalex.org/W2946906624","https://openalex.org/W841176518","https://openalex.org/W2101077206","https://openalex.org/W2157727563","https://openalex.org/W2470343202","https://openalex.org/W1488443159","https://openalex.org/W2343719514","https://openalex.org/W1504391205"],"abstract_inverted_index":{"Deep":[0],"learning":[1,69,104],"(DL)":[2],"has":[3,56],"become":[4,57],"a":[5,58,90,202],"key":[6],"component":[7],"of":[8,21,92,137,206,244,255,314],"modern":[9],"software.":[10],"In":[11],"the":[12,18,37,45,62,81,95,113,128,133,150,155,210,233,245,256,265,278,289,312],"\u201c":[13],"big":[14,73],"model":[15],"\u201d":[16],"era,":[17],"rich":[19],"features":[20],"DL-based":[22,139,283],"software":[23,64,129,140],"(i.e.,":[24],"DL":[25,31,54,172],"software)":[26],"substantially":[27],"rely":[28],"on":[29,44,141,159,170,195,264,271,288],"powerful":[30,46],"models,":[32,70,74],"e.g.,":[33],"BERT,":[34],"GPT-3,":[35],"and":[36,79,83,153,182,184,198,213,228,291,304,326],"recently":[38],"emerging":[39],"GPT-4,":[40],"which":[41,98],"are":[42,258],"trained":[43],"cloud":[47,331],"with":[48,311],"large":[49],"datasets.":[50],"Hence,":[51],"training":[52,67,96,105,109,121,177,280],"effective":[53],"models":[55],"vital":[59],"stage":[60],"in":[61,94,119,127,149,162],"whole":[63],"lifecycle.":[65],"When":[66],"deep":[68,103],"especially":[71],"those":[72],"developers":[75,117],"need":[76],"to":[77,147,240,260,281,323],"parallelize":[78],"distribute":[80],"computation":[82],"memory":[84],"resources":[85],"amongst":[86],"multiple":[87],"devices":[88],"(e.g.,":[89],"cluster":[91],"GPUs)":[93],"process,":[97],"is":[99],"known":[100],"as":[101,286],"distributed":[102,108,120,142,163,176,279],",":[106],"or":[107,298],"for":[110,218,307,330],"short.":[111],"However,":[112],"unique":[114],"challenges":[115],"that":[116,174,274],"encounter":[118],"process":[122],"have":[123,248],"not":[124],"been":[125],"studied":[126],"engineering":[130],"community.":[131],"Given":[132],"increasingly":[134],"heavy":[135],"dependence":[136],"current":[138],"training,":[143],"this":[144,166],"paper":[145],"aims":[146],"fill":[148],"knowledge":[151],"gap":[152],"presents":[154],"first":[156],"comprehensive":[157],"study":[158],"developers\u2019":[160,188],"issues":[161,189],"training.":[164],"To":[165],"end,":[167],"we":[168,267],"focus":[169],"popular":[171],"frameworks":[173,193],"support":[175],"(including":[178],"TensorFlow,":[179],"PyTorch,":[180],"Keras,":[181],"Horovod)":[183],"analyze":[185],"1,131":[186],"real-world":[187],"about":[190,253],"using":[191],"these":[192],"reported":[194],"Stack":[196],"Overflow":[197],"GitHub.":[199],"We":[200,221],"construct":[201],"fine-grained":[203],"taxonomy":[204],"consisting":[205],"30":[207],"categories":[208],"regarding":[209],"fault":[211,235,246],"symptoms":[212,247],"summarize":[214],"common":[215,292],"fix":[216,250,293],"patterns":[217,294],"different":[219],"symptoms.":[220],"find":[222],"that:":[223],"(1)":[224],"many":[225],"distributed-specific":[226],"faults":[227,230,257],"non-distributed-specific":[229],"inherently":[231],"share":[232],"same":[234],"symptoms,":[236],"making":[237],"it":[238],"challenging":[239],"debug;":[241],"(2)":[242],"most":[243],"frequent":[249,290],"patterns;":[251],"(3)":[252],"half":[254],"related":[259],"system-level":[261],"configurations.":[262],"Based":[263],"results,":[266],"suggest":[268],"actionable":[269],"implications":[270],"research":[272],"avenues":[273],"can":[275],"potentially":[276],"facilitate":[277],"develop":[282],"software,":[284],"such":[285],"focusing":[287],"when":[295],"designing":[296,318,327],"testing":[297,303],"debugging":[299,305],"tools,":[300],"developing":[301],"efficient":[302],"techniques":[306,322],"communication":[308],"configuration":[309,316],"along":[310],"synthesis":[313],"network":[315],"analysis,":[317],"new":[319],"multi-device":[320],"checkpoint-and-replay":[321],"help":[324],"reproduction,":[325],"serverless":[328],"APIs":[329],"platforms.":[332]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
