{"id":"https://openalex.org/W4327694855","doi":"https://doi.org/10.1109/tpds.2023.3247001","title":"Merak: An Efficient Distributed DNN Training Framework With Automated 3D Parallelism for Giant Foundation Models","display_name":"Merak: An Efficient Distributed DNN Training Framework With Automated 3D Parallelism for Giant Foundation Models","publication_year":2023,"publication_date":"2023-03-17","ids":{"openalex":"https://openalex.org/W4327694855","doi":"https://doi.org/10.1109/tpds.2023.3247001"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2023.3247001","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2023.3247001","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055426388","display_name":"Zhiquan Lai","orcid":"https://orcid.org/0000-0002-3458-4732"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiquan Lai","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":"https://orcid.org/0000-0002-3458-4732","affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106623069","display_name":"Shengwei Li","orcid":"https://orcid.org/0000-0002-7419-1511"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengwei Li","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":"https://orcid.org/0000-0002-7419-1511","affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064941010","display_name":"Xudong Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xudong Tang","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074708780","display_name":"Keshi Ge","orcid":"https://orcid.org/0000-0002-0669-6892"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Keshi Ge","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":"https://orcid.org/0000-0002-0669-6892","affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106623049","display_name":"Weijie Liu","orcid":"https://orcid.org/0000-0002-0354-9467"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weijie Liu","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":"https://orcid.org/0000-0002-0354-9467","affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054823053","display_name":"Yabo Duan","orcid":"https://orcid.org/0009-0002-8370-1277"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yabo Duan","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043045076","display_name":"Linbo Qiao","orcid":"https://orcid.org/0000-0002-8285-2738"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Linbo Qiao","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":"https://orcid.org/0000-0002-8285-2738","affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100440903","display_name":"Dongsheng Li","orcid":"https://orcid.org/0000-0001-9743-2034"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongsheng Li","raw_affiliation_strings":["National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China"],"raw_orcid":"https://orcid.org/0000-0001-9743-2034","affiliations":[{"raw_affiliation_string":"National Laboratory for Parallel and Distributed Processing, College of Computer, National University of Defense Technology, Changsha, Hunan, China","institution_ids":["https://openalex.org/I170215575"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":6.6666,"has_fulltext":false,"cited_by_count":60,"citation_normalized_percentile":{"value":0.97809069,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"34","issue":"5","first_page":"1466","last_page":"1478"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.986299991607666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.861527681350708},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.6335457563400269},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.543952465057373},{"id":"https://openalex.org/keywords/spmd","display_name":"SPMD","score":0.4649898409843445},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4445228576660156},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4443073868751526},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.440822571516037},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.41940200328826904},{"id":"https://openalex.org/keywords/massively-parallel","display_name":"Massively parallel","score":0.41502779722213745},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39161086082458496},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.13974648714065552}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.861527681350708},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.6335457563400269},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.543952465057373},{"id":"https://openalex.org/C7042729","wikidata":"https://www.wikidata.org/wiki/Q2289219","display_name":"SPMD","level":2,"score":0.4649898409843445},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4445228576660156},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4443073868751526},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.440822571516037},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.41940200328826904},{"id":"https://openalex.org/C190475519","wikidata":"https://www.wikidata.org/wiki/Q544384","display_name":"Massively parallel","level":2,"score":0.41502779722213745},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39161086082458496},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.13974648714065552},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2023.3247001","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2023.3247001","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3606055544","display_name":null,"funder_award_id":"62025208","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":64,"referenced_works":["https://openalex.org/W2057332538","https://openalex.org/W2338908902","https://openalex.org/W2896457183","https://openalex.org/W2969388332","https://openalex.org/W2973727699","https://openalex.org/W2979826702","https://openalex.org/W2991615366","https://openalex.org/W3081168214","https://openalex.org/W3086105743","https://openalex.org/W3094502228","https://openalex.org/W3129831491","https://openalex.org/W3132107458","https://openalex.org/W3138516171","https://openalex.org/W3158631574","https://openalex.org/W3159351344","https://openalex.org/W3166243953","https://openalex.org/W3167220634","https://openalex.org/W3189259198","https://openalex.org/W3195577433","https://openalex.org/W3202134982","https://openalex.org/W3205803342","https://openalex.org/W3206832494","https://openalex.org/W3209444800","https://openalex.org/W3210871626","https://openalex.org/W3213938204","https://openalex.org/W4220741164","https://openalex.org/W4221167110","https://openalex.org/W4226057994","https://openalex.org/W4226188310","https://openalex.org/W4287756266","https://openalex.org/W4288089799","https://openalex.org/W4301239768","https://openalex.org/W4308536752","https://openalex.org/W4312681979","https://openalex.org/W6703652217","https://openalex.org/W6739901393","https://openalex.org/W6748645090","https://openalex.org/W6755207826","https://openalex.org/W6756718674","https://openalex.org/W6767997687","https://openalex.org/W6768723914","https://openalex.org/W6769243733","https://openalex.org/W6769275798","https://openalex.org/W6769627184","https://openalex.org/W6774934307","https://openalex.org/W6778729859","https://openalex.org/W6778883912","https://openalex.org/W6779863968","https://openalex.org/W6779999780","https://openalex.org/W6784333009","https://openalex.org/W6787953186","https://openalex.org/W6795846692","https://openalex.org/W6796811805","https://openalex.org/W6797242466","https://openalex.org/W6799372109","https://openalex.org/W6800751262","https://openalex.org/W6803433826","https://openalex.org/W6804052296","https://openalex.org/W6804752602","https://openalex.org/W6810088661","https://openalex.org/W6810296985","https://openalex.org/W6810876399","https://openalex.org/W6811295506","https://openalex.org/W6840553703"],"related_works":["https://openalex.org/W2950520577","https://openalex.org/W1554644772","https://openalex.org/W2494130044","https://openalex.org/W2003935582","https://openalex.org/W2033862586","https://openalex.org/W3209384898","https://openalex.org/W74409296","https://openalex.org/W2158747600","https://openalex.org/W1991844655","https://openalex.org/W1595834484"],"abstract_inverted_index":{"Foundation":[0],"models":[1,228],"are":[2,80],"in":[3],"the":[4,8,22,27,36],"process":[5,38],"of":[6,25,97,196,227],"becoming":[7],"dominant":[9],"deep":[10,117],"learning":[11,118],"technology.":[12],"Pretraining":[13],"a":[14,137,148,165,181],"foundation":[15,154],"model":[16,28,58,62,84,88,133,143,155,203],"is":[17,39,105],"always":[18],"time-consuming":[19],"due":[20],"to":[21,49,64,83,90,151,175,217,239],"large":[23],"scale":[24,152],"both":[26],"parameter":[29],"and":[30,42,60,93,102,140,200,208,233,243],"training":[31,67,119,156,178,220],"dataset.":[32],"Besides":[33],"being":[34],"computing-intensive,":[35],"pretraining":[37],"extremely":[40],"memory-":[41],"communication-intensive.":[43],"These":[44],"challenges":[45],"make":[46],"it":[47],"necessary":[48],"apply":[50],"3D":[51,71,115,128,167,224],"parallelism,":[52,56,59,63],"which":[53,135],"integrates":[54],"data":[55],"pipeline":[57,185],"tensor":[61,202],"achieve":[65],"high":[66,122],"efficiency.":[68],"However,":[69],"current":[70],"parallelism":[72,116,129,204,225],"frameworks":[73,226],"still":[74],"encounter":[75],"two":[76],"issues:":[77],"i)":[78],"they":[79],"not":[81],"transparent":[82],"developers,":[85],"requiring":[86],"manual":[87],"modification":[89],"parallelize":[91],"training,":[92],"ii)":[94],"their":[95],"utilization":[96],"computation":[98,189],"resources,":[99,179],"GPU":[100],"memory,":[101,199],"network":[103],"bandwidth":[104],"insufficient.":[106],"We":[107],"propose":[108],"<italic":[109],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[110],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Merak</i>":[111],",":[112],"an":[113,131],"automated":[114],"framework":[120],"with":[121,130,157,229],"resource":[123],"utilization.":[124],"Merak":[125,145],"automatically":[126],"deploys":[127],"automatic":[132],"partitioner,":[134],"includes":[136],"graph-sharding":[138],"algorithm":[139],"proxy":[141],"node-based":[142],"graph.":[144],"also":[146],"offers":[147],"non-intrusive":[149],"API":[150],"out":[153],"minimal":[158],"code":[159],"modification.":[160],"In":[161],"addition,":[162],"we":[163],"design":[164],"high-performance":[166],"parallel":[168],"runtime":[169],"engine":[170],"that":[171,187,193,205],"employs":[172],"several":[173],"techniques":[174],"exploit":[176],"available":[177],"including":[180],"shifted":[182],"critical":[183],"path":[184],"schedule":[186],"increases":[188],"utilization,":[190],"stage-aware":[191],"recomputation":[192],"makes":[194],"use":[195],"idle":[197],"worker":[198],"sub-pipelined":[201],"overlaps":[206],"communication":[207],"computation.":[209],"Experiments":[210],"on":[211],"64":[212],"GPUs":[213],"demonstrate":[214],"Merak's":[215],"capability":[216],"speed":[218],"up":[219,238],"performance":[221],"over":[222],"state-of-the-art":[223],"1.5,":[230],"2.5,":[231],"8.3,":[232],"20":[234],"billion":[235],"parameters":[236],"by":[237],"1.42,":[240],"1.39,":[241],"1.43,":[242],"1.61\u00d7,":[244],"respectively.":[245]},"counts_by_year":[{"year":2026,"cited_by_count":9},{"year":2025,"cited_by_count":21},{"year":2024,"cited_by_count":16},{"year":2023,"cited_by_count":14}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-10-10T00:00:00"}
