{"id":"https://openalex.org/W4280494215","doi":"https://doi.org/10.48550/arxiv.2205.03983","title":"Building Machine Translation Systems for the Next Thousand Languages","display_name":"Building Machine Translation Systems for the Next Thousand Languages","publication_year":2022,"publication_date":"2022-05-09","ids":{"openalex":"https://openalex.org/W4280494215","doi":"https://doi.org/10.48550/arxiv.2205.03983"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2205.03983","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.03983","pdf_url":"https://arxiv.org/pdf/2205.03983","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2205.03983","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024316712","display_name":"Ankur Bapna","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bapna, Ankur","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023109845","display_name":"Isaac Caswell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Caswell, Isaac","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048307591","display_name":"Julia Kreutzer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kreutzer, Julia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035914396","display_name":"Orhan F\u0131rat","orcid":"https://orcid.org/0000-0001-5775-2420"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Firat, Orhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113902201","display_name":"Daan van Esch","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"van Esch, Daan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021329708","display_name":"Aditya Siddhant","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siddhant, Aditya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112577945","display_name":"Mengmeng Niu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niu, Mengmeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007389429","display_name":"Pallavi Baljekar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baljekar, Pallavi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082383881","display_name":"Xavier Garc\u00eda","orcid":"https://orcid.org/0000-0002-8500-4224"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garcia, Xavier","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018646392","display_name":"Wolfgang Macherey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Macherey, Wolfgang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063381559","display_name":"Theresa Breiner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Breiner, Theresa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090245882","display_name":"Vera Axelrod","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Axelrod, Vera","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015547427","display_name":"Jason Riesa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Riesa, Jason","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100400922","display_name":"Yuan Cao","orcid":"https://orcid.org/0000-0002-8775-0626"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025244406","display_name":"Mia Xu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Mia Xu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008084569","display_name":"Klaus Macherey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Macherey, Klaus","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014055962","display_name":"Maxim Krikun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krikun, Maxim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053822641","display_name":"Pidong Wang","orcid":"https://orcid.org/0000-0002-3118-8718"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Pidong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029193854","display_name":"Alexander Gutkin","orcid":"https://orcid.org/0000-0001-6327-4824"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gutkin, Alexander","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039974286","display_name":"Apurva S. Shah","orcid":"https://orcid.org/0000-0002-9597-5665"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Apurva","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104102021","display_name":"Yanping Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yanping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100715425","display_name":"Zhifeng Chen","orcid":"https://orcid.org/0000-0001-8901-1002"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zhifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101011504","display_name":"Yonghui Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yonghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5067705729","display_name":"Macduff Hughes","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hughes, Macduff","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":24,"corresponding_author_ids":["https://openalex.org/A5024316712"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":43,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9830999970436096,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8436064124107361},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.5571524500846863},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5471366047859192},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5196731686592102},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5018534660339355},{"id":"https://openalex.org/keywords/complement","display_name":"Complement (music)","score":0.48784637451171875},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.47568073868751526},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.40475717186927795}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8436064124107361},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.5571524500846863},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5471366047859192},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5196731686592102},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5018534660339355},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.48784637451171875},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.47568073868751526},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.40475717186927795},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C188082640","wikidata":"https://www.wikidata.org/wiki/Q1780899","display_name":"Complementation","level":4,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C127716648","wikidata":"https://www.wikidata.org/wiki/Q104053","display_name":"Phenotype","level":3,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2205.03983","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.03983","pdf_url":"https://arxiv.org/pdf/2205.03983","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2205.03983","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2205.03983","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2205.03983","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.03983","pdf_url":"https://arxiv.org/pdf/2205.03983","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3011059803","https://openalex.org/W2382566571","https://openalex.org/W2349321798","https://openalex.org/W2775554247","https://openalex.org/W2110168585","https://openalex.org/W3107474891","https://openalex.org/W2250213760","https://openalex.org/W4386247111","https://openalex.org/W4327642362","https://openalex.org/W2587014613"],"abstract_inverted_index":{"In":[0],"this":[1],"paper":[2],"we":[3],"share":[4],"findings":[5],"from":[6,100],"our":[7,101,117],"effort":[8],"to":[9,122],"build":[10],"practical":[11,53],"machine":[12],"translation":[13],"(MT)":[14],"systems":[15,128],"capable":[16],"of":[17,87,97,109,112,142],"translating":[18],"across":[19],"over":[20,70],"one":[21],"thousand":[22],"languages.":[23],"We":[24,114],"describe":[25],"results":[26],"in":[27,146],"three":[28],"research":[29,135],"domains:":[30],"(i)":[31],"Building":[32],"clean,":[33],"web-mined":[34],"datasets":[35,76],"for":[36,43,56,69,77,90,129],"1500+":[37],"languages":[38,58,73,92],"by":[39,59],"leveraging":[40,60],"semi-supervised":[41],"pre-training":[42],"language":[44],"identification":[45],"and":[46,74,82,93,133],"developing":[47],"data-driven":[48],"filtering":[49],"techniques;":[50],"(ii)":[51],"Developing":[52],"MT":[54,102,127],"models":[55,63,145],"under-served":[57],"massively":[61,143],"multilingual":[62,144],"trained":[64],"with":[65],"supervised":[66],"parallel":[67],"data":[68],"100":[71],"high-resource":[72],"monolingual":[75],"an":[78],"additional":[79],"1000+":[80],"languages;":[81],"(iii)":[83],"Studying":[84],"the":[85,98,140],"limitations":[86],"evaluation":[88],"metrics":[89],"these":[91,110],"conducting":[94],"qualitative":[95],"analysis":[96],"outputs":[99],"models,":[103],"highlighting":[104],"several":[105],"frequent":[106],"error":[107],"modes":[108],"types":[111],"models.":[113],"hope":[115],"that":[116,137],"work":[118],"provides":[119],"useful":[120],"insights":[121],"practitioners":[123],"working":[124],"towards":[125],"building":[126],"currently":[130],"understudied":[131],"languages,":[132],"highlights":[134],"directions":[136],"can":[138],"complement":[139],"weaknesses":[141],"data-sparse":[147],"settings.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":31},{"year":2022,"cited_by_count":7}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
