{"id":"https://openalex.org/W4391993458","doi":"https://doi.org/10.1145/3639034","title":"Thorough Characterization and Analysis of Large Transformer Model Training At-Scale","display_name":"Thorough Characterization and Analysis of Large Transformer Model Training At-Scale","publication_year":2024,"publication_date":"2024-02-16","ids":{"openalex":"https://openalex.org/W4391993458","doi":"https://doi.org/10.1145/3639034"},"language":"en","primary_location":{"id":"doi:10.1145/3639034","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3639034","pdf_url":null,"source":{"id":"https://openalex.org/S4210193547","display_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","issn_l":"2476-1249","issn":["2476-1249"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035702201","display_name":"S. H. Cheng","orcid":"https://orcid.org/0000-0001-9954-7986"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Scott Cheng","raw_affiliation_strings":["Pennsylvania State University, State College, USA"],"raw_orcid":"https://orcid.org/0000-0001-9954-7986","affiliations":[{"raw_affiliation_string":"Pennsylvania State University, State College, USA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002465188","display_name":"Jun-Liang Lin","orcid":"https://orcid.org/0000-0001-5105-7574"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jun-Liang Lin","raw_affiliation_strings":["Pennsylvania State University, State College, USA"],"raw_orcid":"https://orcid.org/0000-0001-5105-7574","affiliations":[{"raw_affiliation_string":"Pennsylvania State University, State College, USA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014448094","display_name":"Murali Emani","orcid":"https://orcid.org/0000-0002-6279-0007"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Murali Emani","raw_affiliation_strings":["Argonne National Laboratory, Lemont, USA"],"raw_orcid":"https://orcid.org/0000-0002-6279-0007","affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039783562","display_name":"Siddhisanket Raskar","orcid":"https://orcid.org/0000-0002-4832-0834"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Siddhisanket Raskar","raw_affiliation_strings":["Argonne National Laboratory, Lemont, USA"],"raw_orcid":"https://orcid.org/0000-0002-4832-0834","affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028293798","display_name":"Sam Foreman","orcid":"https://orcid.org/0000-0002-9981-0876"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sam Foreman","raw_affiliation_strings":["Argonne National Laboratory, Lemont, USA"],"raw_orcid":"https://orcid.org/0000-0002-9981-0876","affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037768390","display_name":"Zhen Xie","orcid":"https://orcid.org/0000-0003-3516-2192"},"institutions":[{"id":"https://openalex.org/I123946342","display_name":"Binghamton University","ror":"https://ror.org/008rmbt77","country_code":"US","type":"education","lineage":["https://openalex.org/I123946342"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Xie","raw_affiliation_strings":["Binghamton University, Vestal, USA"],"raw_orcid":"https://orcid.org/0000-0003-3516-2192","affiliations":[{"raw_affiliation_string":"Binghamton University, Vestal, USA","institution_ids":["https://openalex.org/I123946342"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075500139","display_name":"Venkatram Vishwanath","orcid":"https://orcid.org/0000-0001-7248-6116"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Venkatram Vishwanath","raw_affiliation_strings":["Argonne National Laboratory, Lemont, USA"],"raw_orcid":"https://orcid.org/0000-0001-7248-6116","affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5007116603","display_name":"Mahmut Kandemir","orcid":"https://orcid.org/0000-0002-9940-9951"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mahmut Taylan Kandemir","raw_affiliation_strings":["Pennsylvania State University, State College, USA"],"raw_orcid":"https://orcid.org/0000-0002-9940-9951","affiliations":[{"raw_affiliation_string":"Pennsylvania State University, State College, USA","institution_ids":["https://openalex.org/I130769515"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.1755,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.87494251,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"8","issue":"1","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9922999739646912,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7889004945755005},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7370433807373047},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.6453100442886353},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.630317211151123},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.6187714338302612},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.568794310092926},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5628580451011658},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.45129889249801636},{"id":"https://openalex.org/keywords/task-parallelism","display_name":"Task parallelism","score":0.4399445056915283},{"id":"https://openalex.org/keywords/performance-metric","display_name":"Performance metric","score":0.4398791790008545},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.36551597714424133},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3337574601173401},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.08655321598052979},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.06496253609657288}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7889004945755005},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7370433807373047},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.6453100442886353},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.630317211151123},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.6187714338302612},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.568794310092926},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5628580451011658},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.45129889249801636},{"id":"https://openalex.org/C42992933","wikidata":"https://www.wikidata.org/wiki/Q691169","display_name":"Task parallelism","level":3,"score":0.4399445056915283},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.4398791790008545},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.36551597714424133},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3337574601173401},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.08655321598052979},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.06496253609657288},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3639034","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3639034","pdf_url":null,"source":{"id":"https://openalex.org/S4210193547","display_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","issn_l":"2476-1249","issn":["2476-1249"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1313983767","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G174627229","display_name":null,"funder_award_id":"1931531","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2777053550","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G3075337988","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G4681045432","display_name":null,"funder_award_id":"2122155","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G498139845","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G5085543421","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320338284","funder_display_name":"Argonne National Laboratory"},{"id":"https://openalex.org/G6558272803","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6848031779","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6918803902","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G7351994996","display_name":null,"funder_award_id":"DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7917365220","display_name":null,"funder_award_id":"2008398","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7954425250","display_name":null,"funder_award_id":"DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G8143874970","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G969889393","display_name":null,"funder_award_id":"DE-AC02-","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320332359","display_name":"Office of Science","ror":"https://ror.org/00mmn6b08"},{"id":"https://openalex.org/F4320338284","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1585237817","https://openalex.org/W2734941459","https://openalex.org/W2896457183","https://openalex.org/W3038827379","https://openalex.org/W3072623287","https://openalex.org/W3081168214","https://openalex.org/W3129488589","https://openalex.org/W3129831491","https://openalex.org/W3159351344","https://openalex.org/W3175937250","https://openalex.org/W3193985311","https://openalex.org/W3204998121","https://openalex.org/W3205803342","https://openalex.org/W4281758439","https://openalex.org/W4287748555","https://openalex.org/W4294433898","https://openalex.org/W4312933868","https://openalex.org/W4318541692","https://openalex.org/W4386768656"],"related_works":["https://openalex.org/W2950520577","https://openalex.org/W2003935582","https://openalex.org/W74409296","https://openalex.org/W1554644772","https://openalex.org/W2494130044","https://openalex.org/W29548032","https://openalex.org/W2593878938","https://openalex.org/W2468095077","https://openalex.org/W305742777","https://openalex.org/W2121547511"],"abstract_inverted_index":{"Large":[0],"transformer":[1,20,63,187],"models":[2,64],"have":[3],"recently":[4],"achieved":[5],"great":[6],"success":[7],"across":[8],"various":[9,57],"domains.":[10],"With":[11],"a":[12,18,47,73,81,105,113,182],"growing":[13],"number":[14],"of":[15,36,49,62,80,116,141,167,206],"model":[16,21,26,31,38,50,89,155,160,168,188],"parameters,":[17],"large":[19],"training":[22,39,94,117,133,200],"today":[23],"typically":[24],"involves":[25,137],"sharding,":[27],"data":[28,87,142],"parallelism,":[29,143],"and":[30,52,88,121,124,152,173,190,193],"parallelism.":[32],"Thus,":[33],"the":[34,43,78,203],"throughput":[35,118],"large-scale":[37,186],"depends":[40],"heavily":[41],"on":[42,65,96,130,170,185],"network":[44],"bandwidth":[45,175],"since":[46],"combination":[48],"sharding":[51,156],"multiple":[53],"parallelism":[54,90,169],"strategies":[55,157],"incurs":[56],"costs.":[58],"However,":[59],"prior":[60],"characterizations":[61],"high-bandwidth":[66],"DGX":[67],"machines":[68],"that":[69],"use":[70],"TFLOPS":[71],"as":[72],"metric":[74],"may":[75],"not":[76],"reflect":[77],"performance":[79],"system":[82,98,208],"with":[83,149],"lower":[84],"bandwidth.":[85],"Furthermore,":[86],"reveal":[91],"significantly":[92],"distinct":[93],"profiles":[95],"different":[97],"bandwidths":[99],"at":[100],"scale":[101],"and,":[102],"thus,":[103],"need":[104],"thorough":[106],"study.":[107],"In":[108],"this":[109],"paper,":[110],"we":[111],"provide":[112],"bottom-up":[114],"breakdown":[115],"into":[119],"compute":[120],"communication":[122],"time,":[123],"quantitatively":[125],"analyze":[126],"their":[127],"respective":[128],"influences":[129],"overall":[131],"end-to-end":[132],"scaling.":[134],"Our":[135],"evaluation":[136,194],"an":[138],"in-depth":[139],"exploration":[140],"scaling":[144],"up":[145],"to":[146],"512":[147],"GPUs":[148],"limited":[150],"bandwidth,":[151],"examines":[153],"three":[154,165],"among":[158],"six":[159],"sizes.":[161],"We":[162],"also":[163],"evaluate":[164],"combinations":[166],"both":[171],"high":[172],"low":[174],"supercomputing":[176,207],"systems.":[177],"Overall,":[178],"our":[179,191],"work":[180],"provides":[181],"broader":[183],"perspective":[184],"training,":[189],"analysis":[192],"yield":[195],"practical":[196],"insights":[197],"for":[198],"predicting":[199],"scaling,":[201],"shaping":[202],"future":[204],"development":[205],"design.":[209]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-14T07:44:22.658603","created_date":"2025-10-10T00:00:00"}
