{"id":"https://openalex.org/W2963542740","doi":"https://doi.org/10.18653/v1/p19-1176","title":"Learning Deep Transformer Models for Machine Translation","display_name":"Learning Deep Transformer Models for Machine Translation","publication_year":2019,"publication_date":"2019-01-01","ids":{"openalex":"https://openalex.org/W2963542740","doi":"https://doi.org/10.18653/v1/p19-1176","mag":"2963542740"},"language":"en","primary_location":{"id":"doi:10.18653/v1/p19-1176","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/p19-1176","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.18653/v1/p19-1176","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025187681","display_name":"Qiang Wang","orcid":"https://orcid.org/0000-0002-9392-475X"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qiang Wang","raw_affiliation_strings":["NLP Lab, Northeastern University, Shenyang, China"],"affiliations":[{"raw_affiliation_string":"NLP Lab, Northeastern University, Shenyang, China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107028150","display_name":"Bei Li","orcid":"https://orcid.org/0000-0001-7617-9041"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bei Li","raw_affiliation_strings":["NLP Lab, Northeastern University, Shenyang, China"],"affiliations":[{"raw_affiliation_string":"NLP Lab, Northeastern University, Shenyang, China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100600701","display_name":"Tong Xiao","orcid":"https://orcid.org/0000-0002-5842-6501"},"institutions":[{"id":"https://openalex.org/I204512498","display_name":"University of Macau","ror":"https://ror.org/01r4q9n85","country_code":"MO","type":"education","lineage":["https://openalex.org/I204512498"]},{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN","MO"],"is_corresponding":false,"raw_author_name":"Tong Xiao","raw_affiliation_strings":["NiuTrans Co., Ltd., Shenyang, China","CT Lab, University of Macau, Macau, China","NLP Lab, Northeastern University, Shenyang, China"],"affiliations":[{"raw_affiliation_string":"NiuTrans Co., Ltd., Shenyang, China","institution_ids":[]},{"raw_affiliation_string":"CT Lab, University of Macau, Macau, China","institution_ids":["https://openalex.org/I204512498"]},{"raw_affiliation_string":"NLP Lab, Northeastern University, Shenyang, China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100370145","display_name":"Jingbo Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]},{"id":"https://openalex.org/I204512498","display_name":"University of Macau","ror":"https://ror.org/01r4q9n85","country_code":"MO","type":"education","lineage":["https://openalex.org/I204512498"]}],"countries":["CN","MO"],"is_corresponding":false,"raw_author_name":"Jingbo Zhu","raw_affiliation_strings":["CT Lab, University of Macau, Macau, China","NiuTrans Co., Ltd., Shenyang, China","NLP Lab, Northeastern University, Shenyang, China"],"affiliations":[{"raw_affiliation_string":"CT Lab, University of Macau, Macau, China","institution_ids":["https://openalex.org/I204512498"]},{"raw_affiliation_string":"NiuTrans Co., Ltd., Shenyang, China","institution_ids":[]},{"raw_affiliation_string":"NLP Lab, Northeastern University, Shenyang, China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101857422","display_name":"Changliang Li","orcid":"https://orcid.org/0000-0003-2236-9266"},"institutions":[{"id":"https://openalex.org/I4210108461","display_name":"Kingsoft (China)","ror":"https://ror.org/01stnfn33","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210108461"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Changliang Li","raw_affiliation_strings":["Kingsoft AI Lab, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kingsoft AI Lab, Beijing, China","institution_ids":["https://openalex.org/I4210108461"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101468579","display_name":"Derek F. Wong","orcid":"https://orcid.org/0000-0002-5307-7322"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Derek F. Wong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5025832925","display_name":"Lidia S. Chao","orcid":"https://orcid.org/0000-0001-6629-170X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lidia S. Chao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5025187681"],"corresponding_institution_ids":["https://openalex.org/I9224756"],"apc_list":null,"apc_paid":null,"fwci":42.6825,"has_fulltext":false,"cited_by_count":617,"citation_normalized_percentile":{"value":0.99836292,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1810","last_page":"1822"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9869999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7635256052017212},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.7436230182647705},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.69034343957901},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6740043759346008},{"id":"https://openalex.org/keywords/nist","display_name":"NIST","score":0.5645008087158203},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5364291071891785},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4620702862739563},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.43853676319122314},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3539128005504608},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.19158542156219482},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1639593541622162},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.1470191776752472}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7635256052017212},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.7436230182647705},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.69034343957901},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6740043759346008},{"id":"https://openalex.org/C111219384","wikidata":"https://www.wikidata.org/wiki/Q6954384","display_name":"NIST","level":2,"score":0.5645008087158203},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5364291071891785},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4620702862739563},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.43853676319122314},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3539128005504608},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.19158542156219482},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1639593541622162},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.1470191776752472},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/p19-1176","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/p19-1176","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/p19-1176","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/p19-1176","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6700000166893005,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1543750907","https://openalex.org/W1597944220","https://openalex.org/W1815076433","https://openalex.org/W1902237438","https://openalex.org/W2113104171","https://openalex.org/W2128892113","https://openalex.org/W2130942839","https://openalex.org/W2133564696","https://openalex.org/W2194775991","https://openalex.org/W2302255633","https://openalex.org/W2525778437","https://openalex.org/W2594990650","https://openalex.org/W2767008699","https://openalex.org/W2798761464","https://openalex.org/W2817535134","https://openalex.org/W2888520903","https://openalex.org/W2890964657","https://openalex.org/W2896060389","https://openalex.org/W2896457183","https://openalex.org/W2902081112","https://openalex.org/W2962739339","https://openalex.org/W2962784628","https://openalex.org/W2962931466","https://openalex.org/W2963088785","https://openalex.org/W2963212250","https://openalex.org/W2963216553","https://openalex.org/W2963302407","https://openalex.org/W2963341956","https://openalex.org/W2963403868","https://openalex.org/W2963418779","https://openalex.org/W2963599677","https://openalex.org/W2963636855","https://openalex.org/W2963755523","https://openalex.org/W2963807318","https://openalex.org/W2963925437","https://openalex.org/W2963991316","https://openalex.org/W2964045208","https://openalex.org/W2964088127","https://openalex.org/W2964121744","https://openalex.org/W2964308564","https://openalex.org/W4297747548","https://openalex.org/W4300831640","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W3176018525","https://openalex.org/W3026554633","https://openalex.org/W2903810591","https://openalex.org/W4289548192","https://openalex.org/W2888520903","https://openalex.org/W2963499882","https://openalex.org/W2903399267","https://openalex.org/W2949454572","https://openalex.org/W3098873988","https://openalex.org/W2952599318"],"abstract_inverted_index":{"Transformer":[0,40,74],"is":[1,133],"the":[2,22,32,39,43,51,61,66,78,95,101,117,130],"state-of-the-art":[3],"model":[4,75,132],"in":[5,136,141],"recent":[6],"machine":[7],"translation":[8],"evaluations.":[9],"Two":[10],"strands":[11],"of":[12,19,38,63,85,93,97],"research":[13,64],"are":[14],"promising":[15],"to":[16,100],"improve":[17],"models":[18],"this":[20],"kind:":[21],"first":[23],"uses":[24,45],"wide":[25],"networks":[26],"(a.k.a.":[27],"Transformer-Big)":[28],"and":[29,42,88,106,138],"has":[30],"been":[31],"de":[33],"facto":[34],"standard":[35],"for":[36],"development":[37],"system,":[41],"other":[44],"deeper":[46],"language":[47],"representation":[48],"but":[49],"faces":[50],"difficulty":[52],"arising":[53],"from":[54],"learning":[55],"deep":[56,73,112,131],"networks.":[57],"Here,":[58],"we":[59],"continue":[60],"line":[62],"on":[65],"latter.":[67],"We":[68],"claim":[69],"that":[70],"a":[71,90],"truly":[72],"can":[76],"surpass":[77],"Transformer-Big":[79],"counterpart":[80],"by":[81,123],"1)":[82],"proper":[83],"use":[84],"layer":[86],"normalization":[87],"2)":[89],"novel":[91],"way":[92],"passing":[94],"combination":[96],"previous":[98],"layers":[99],"next.":[102],"On":[103],"WMT\u201916":[104],"English-German":[105],"NIST":[107],"OpenMT\u201912":[108],"Chinese-English":[109],"tasks,":[110],"our":[111],"system":[113],"(30/25-layer":[114],"encoder)":[115,122],"outperforms":[116],"shallow":[118],"Transformer-Big/Base":[119],"baseline":[120],"(6-layer":[121],"0.4-2.4":[124],"BLEU":[125],"points.":[126],"As":[127],"another":[128],"bonus,":[129],"1.6X":[134],"smaller":[135],"size":[137],"3X":[139],"faster":[140],"training":[142],"than":[143],"Transformer-Big.":[144]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":78},{"year":2024,"cited_by_count":89},{"year":2023,"cited_by_count":148},{"year":2022,"cited_by_count":86},{"year":2021,"cited_by_count":110},{"year":2020,"cited_by_count":91},{"year":2019,"cited_by_count":8}],"updated_date":"2026-03-27T14:29:43.386196","created_date":"2025-10-10T00:00:00"}
