{"id":"https://openalex.org/W4394698525","doi":"https://doi.org/10.1109/tpami.2024.3386927","title":"DeepNet: Scaling Transformers to 1,000 Layers","display_name":"DeepNet: Scaling Transformers to 1,000 Layers","publication_year":2024,"publication_date":"2024-04-10","ids":{"openalex":"https://openalex.org/W4394698525","doi":"https://doi.org/10.1109/tpami.2024.3386927"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2024.3386927","is_oa":true,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3386927","pdf_url":"https://ieeexplore.ieee.org/ielx7/34/4359286/10496231.pdf","source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ieeexplore.ieee.org/ielx7/34/4359286/10496231.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083999373","display_name":"Hongyu Wang","orcid":"https://orcid.org/0000-0003-1811-3903"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hongyu Wang","raw_affiliation_strings":["School of Computer and Control Engineering, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-1811-3903","affiliations":[{"raw_affiliation_string":"School of Computer and Control Engineering, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113130010","display_name":"Shuming Ma","orcid":"https://orcid.org/0000-0003-1091-1206"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuming Ma","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-1091-1206","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101751775","display_name":"Li Dong","orcid":"https://orcid.org/0000-0003-3083-7170"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Dong","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-3083-7170","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061624006","display_name":"Shaohan Huang","orcid":"https://orcid.org/0000-0003-4324-6337"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaohan Huang","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-4324-6337","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100416386","display_name":"Dongdong Zhang","orcid":"https://orcid.org/0000-0003-0833-7903"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongdong Zhang","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-0833-7903","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014662947","display_name":"Furu Wei","orcid":"https://orcid.org/0000-0002-7810-5852"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Furu Wei","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7810-5852","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5083999373"],"corresponding_institution_ids":["https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":9.6889,"has_fulltext":true,"cited_by_count":51,"citation_normalized_percentile":{"value":0.98652897,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"46","issue":"10","first_page":"6761","last_page":"6774"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6436646580696106},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5792323350906372},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.5786069631576538},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5732437968254089},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.5634434223175049},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5169355273246765},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.4184033274650574},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.4171207547187805},{"id":"https://openalex.org/keywords/approx","display_name":"Approx","score":0.412531316280365},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.18054473400115967},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.17344853281974792},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.10920649766921997},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.08376586437225342},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07400268316268921}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6436646580696106},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5792323350906372},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.5786069631576538},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5732437968254089},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.5634434223175049},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5169355273246765},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.4184033274650574},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4171207547187805},{"id":"https://openalex.org/C2777894999","wikidata":"https://www.wikidata.org/wiki/Q4781758","display_name":"Approx","level":2,"score":0.412531316280365},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.18054473400115967},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.17344853281974792},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.10920649766921997},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.08376586437225342},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07400268316268921},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpami.2024.3386927","is_oa":true,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3386927","pdf_url":"https://ieeexplore.ieee.org/ielx7/34/4359286/10496231.pdf","source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/tpami.2024.3386927","is_oa":true,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3386927","pdf_url":"https://ieeexplore.ieee.org/ielx7/34/4359286/10496231.pdf","source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4394698525.pdf","grobid_xml":"https://content.openalex.org/works/W4394698525.grobid-xml"},"referenced_works_count":80,"referenced_works":["https://openalex.org/W2117539524","https://openalex.org/W2194775991","https://openalex.org/W2257408573","https://openalex.org/W2507296351","https://openalex.org/W2692059227","https://openalex.org/W2760656271","https://openalex.org/W2884822772","https://openalex.org/W2896457183","https://openalex.org/W2903193068","https://openalex.org/W2923014074","https://openalex.org/W2946609015","https://openalex.org/W2963532001","https://openalex.org/W2963542740","https://openalex.org/W2964085268","https://openalex.org/W2965373594","https://openalex.org/W2970279348","https://openalex.org/W2970290486","https://openalex.org/W2971033911","https://openalex.org/W2979636403","https://openalex.org/W2996908057","https://openalex.org/W3001279689","https://openalex.org/W3010768098","https://openalex.org/W3016010032","https://openalex.org/W3017454464","https://openalex.org/W3035390927","https://openalex.org/W3037492894","https://openalex.org/W3093871477","https://openalex.org/W3103334733","https://openalex.org/W3105425516","https://openalex.org/W3174726724","https://openalex.org/W3175301726","https://openalex.org/W3175746962","https://openalex.org/W3177096435","https://openalex.org/W3177828909","https://openalex.org/W3205328383","https://openalex.org/W4200634402","https://openalex.org/W4221167110","https://openalex.org/W4225591000","https://openalex.org/W4226155321","https://openalex.org/W4287391717","https://openalex.org/W4288089799","https://openalex.org/W4300963525","https://openalex.org/W4301914798","https://openalex.org/W4322718191","https://openalex.org/W4384918448","https://openalex.org/W4385245566","https://openalex.org/W4386071687","https://openalex.org/W6631190155","https://openalex.org/W6631943919","https://openalex.org/W6635469476","https://openalex.org/W6747381837","https://openalex.org/W6752192525","https://openalex.org/W6756718674","https://openalex.org/W6757468910","https://openalex.org/W6763468762","https://openalex.org/W6766673545","https://openalex.org/W6767164110","https://openalex.org/W6768952226","https://openalex.org/W6769627184","https://openalex.org/W6772381481","https://openalex.org/W6772383348","https://openalex.org/W6774776664","https://openalex.org/W6778883912","https://openalex.org/W6780086851","https://openalex.org/W6780805062","https://openalex.org/W6784333009","https://openalex.org/W6784447870","https://openalex.org/W6788811087","https://openalex.org/W6796242362","https://openalex.org/W6796761347","https://openalex.org/W6797610904","https://openalex.org/W6802297474","https://openalex.org/W6805239564","https://openalex.org/W6810220367","https://openalex.org/W6810296985","https://openalex.org/W6810671578","https://openalex.org/W6845401343","https://openalex.org/W6845452229","https://openalex.org/W6850625674","https://openalex.org/W6854866820"],"related_works":["https://openalex.org/W3204184292","https://openalex.org/W3176564347","https://openalex.org/W1985458517","https://openalex.org/W2355833770","https://openalex.org/W4226466875","https://openalex.org/W2977257638","https://openalex.org/W4288095186","https://openalex.org/W3210541621","https://openalex.org/W2755231872","https://openalex.org/W4297807321"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,16],"propose":[4],"a":[5,18,50,76,135,163],"simple":[6],"yet":[7],"effective":[8],"method":[9,55],"to":[10,27,84],"stabilize":[11],"extremely":[12],"deep":[13,105],"Transformers.":[14,106],"Specifically,":[15],"introduce":[17],"new":[19],"normalization":[20],"function":[21],"(":[22],"<sc":[23,73,111],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[24,74,112,173],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">DeepNorm</small>":[25,75],")":[26],"modify":[28],"the":[29,57,150],"residual":[30],"connection":[31],"in":[32,49],"Transformer,":[33],"accompanying":[34],"with":[35,138,145,154],"theoretically":[36],"derived":[37],"initialization.":[38],"In-depth":[39],"theoretical":[40],"analysis":[41],"shows":[42],"that":[43,110],"model":[44,144,153],"updates":[45],"can":[46],"be":[47],"bounded":[48],"stable":[51,68],"way.":[52],"The":[53],"proposed":[54],"combines":[56],"best":[58],"of":[59,65,70,100],"two":[60],"worlds,":[61],"i.e.,":[62],"good":[63],"performance":[64,116],"Post-LN":[66],"and":[67,90,128],"training":[69],"Pre-LN,":[71],"making":[72],"preferred":[77],"alternative.":[78],"We":[79],"successfully":[80],"scale":[81],"Transformers":[82],"up":[83],"1,000":[85],"layers":[86],"(i.e.,":[87,125,131],"2,500":[88],"attention":[89],"feed-forward":[91],"network":[92],"sublayers)":[93],"without":[94],"difficulty,":[95],"which":[96,161],"is":[97,169],"one":[98],"order":[99],"magnitude":[101],"deeper":[102],"than":[103],"previous":[104],"Extensive":[107],"experiments":[108],"demonstrate":[109],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">DeepNet</small>":[113],"has":[114],"superior":[115],"across":[117],"various":[118],"benchmarks,":[119],"including":[120],"machine":[121],"translation,":[122],"language":[123],"modeling":[124],"BERT,":[126],"GPT)":[127],"vision":[129],"pre-training":[130],"BEiT).":[132],"Remarkably,":[133],"on":[134],"multilingual":[136],"benchmark":[137],"7,482":[139],"translation":[140],"directions,":[141],"our":[142],"200-layer":[143],"3.2B":[146],"parameters":[147,156],"significantly":[148],"outperforms":[149],"48-layer":[151],"state-of-the-art":[152],"12B":[155],"by":[157],"5":[158],"BLEU":[159],"points,":[160],"indicates":[162],"promising":[164],"scaling":[165],"direction.":[166],"Our":[167],"code":[168],"available":[170],"at":[171],"<uri":[172],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://aka.ms/torchscale</uri>":[174],".":[175]},"counts_by_year":[{"year":2026,"cited_by_count":8},{"year":2025,"cited_by_count":25},{"year":2024,"cited_by_count":17},{"year":2023,"cited_by_count":1}],"updated_date":"2026-05-15T08:27:34.491423","created_date":"2025-10-10T00:00:00"}
