{"id":"https://openalex.org/W3137010024","doi":"https://doi.org/10.1162/tacl_a_00447","title":"Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets","display_name":"Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W3137010024","doi":"https://doi.org/10.1162/tacl_a_00447","mag":"3137010024"},"language":"en","primary_location":{"id":"doi:10.1162/tacl_a_00447","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00447","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00447/1986585/tacl_a_00447.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00447/1986585/tacl_a_00447.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048307591","display_name":"Julia Kreutzer","orcid":null},"institutions":[{"id":"https://openalex.org/I4210148186","display_name":"Google (Canada)","ror":"https://ror.org/04d06q394","country_code":"CA","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969","https://openalex.org/I4210148186"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Julia Kreutzer","raw_affiliation_strings":["Google Research, Canada","Masakhane NLP, USA"],"affiliations":[{"raw_affiliation_string":"Google Research, Canada","institution_ids":["https://openalex.org/I4210148186"]},{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023109845","display_name":"Isaac Caswell","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Isaac Caswell","raw_affiliation_strings":["Google Research, USA"],"affiliations":[{"raw_affiliation_string":"Google Research, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102886118","display_name":"Lisa Wang","orcid":"https://orcid.org/0000-0003-3477-5755"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lisa Wang","raw_affiliation_strings":["Google Research, Germany","Google Research, USA"],"affiliations":[{"raw_affiliation_string":"Google Research, Germany","institution_ids":[]},{"raw_affiliation_string":"Google Research, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048170743","display_name":"Ahsan Wahab","orcid":null},"institutions":[{"id":"https://openalex.org/I2613432","display_name":"University of South Florida","ror":"https://ror.org/032db5x82","country_code":"US","type":"education","lineage":["https://openalex.org/I2613432"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ahsan Wahab","raw_affiliation_strings":["Turkic Interlingua","University of South Florida, USA"],"affiliations":[{"raw_affiliation_string":"Turkic Interlingua","institution_ids":[]},{"raw_affiliation_string":"University of South Florida, USA","institution_ids":["https://openalex.org/I2613432"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113902201","display_name":"Daan van Esch","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daan van Esch","raw_affiliation_strings":["Google Research, The Netherlands"],"affiliations":[{"raw_affiliation_string":"Google Research, The Netherlands","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016076646","display_name":"Nasanbayar Ulzii-Orshikh","orcid":null},"institutions":[{"id":"https://openalex.org/I155707491","display_name":"Haverford College","ror":"https://ror.org/04fnrxr62","country_code":"US","type":"education","lineage":["https://openalex.org/I155707491"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nasanbayar Ulzii-Orshikh","raw_affiliation_strings":["Haverford College, USA"],"affiliations":[{"raw_affiliation_string":"Haverford College, USA","institution_ids":["https://openalex.org/I155707491"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075701052","display_name":"Allahsera Auguste Tapo","orcid":null},"institutions":[{"id":"https://openalex.org/I4210119482","display_name":"Mali-Folkecenter","ror":"https://ror.org/02gp6p739","country_code":"ML","type":"other","lineage":["https://openalex.org/I4210119482"]}],"countries":["ML"],"is_corresponding":false,"raw_author_name":"Allahsera Tapo","raw_affiliation_strings":["Masakhane NLP, Mali","RobotsMali, Mali"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, Mali","institution_ids":["https://openalex.org/I4210119482"]},{"raw_affiliation_string":"RobotsMali, Mali","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109481132","display_name":"Nishant Subramani","orcid":null},"institutions":[{"id":"https://openalex.org/I4210156221","display_name":"Allen Institute for Artificial Intelligence","ror":"https://ror.org/05w520734","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210156221"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nishant Subramani","raw_affiliation_strings":["Allen Institute for Artificial Intelligence, USA","Masakhane NLP, USA"],"affiliations":[{"raw_affiliation_string":"Allen Institute for Artificial Intelligence, USA","institution_ids":["https://openalex.org/I4210156221"]},{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081885843","display_name":"Artem Sokolov","orcid":"https://orcid.org/0000-0002-8056-0504"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Artem Sokolov","raw_affiliation_strings":["Google Research, Germany"],"affiliations":[{"raw_affiliation_string":"Google Research, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039333739","display_name":"Claytone Sikasote","orcid":"https://orcid.org/0009-0004-1372-4662"},"institutions":[{"id":"https://openalex.org/I33278361","display_name":"University of Zambia","ror":"https://ror.org/03gh19d69","country_code":"ZM","type":"education","lineage":["https://openalex.org/I33278361"]}],"countries":["ZM"],"is_corresponding":false,"raw_author_name":"Claytone Sikasote","raw_affiliation_strings":["Masakhane NLP, Zambia","University of Zambia, Zambia"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, Zambia","institution_ids":[]},{"raw_affiliation_string":"University of Zambia, Zambia","institution_ids":["https://openalex.org/I33278361"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050704181","display_name":"Monang Setyawan","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Monang Setyawan","raw_affiliation_strings":["Google, USA"],"affiliations":[{"raw_affiliation_string":"Google, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021167657","display_name":"Supheakmungkol Sarin","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Supheakmungkol Sarin","raw_affiliation_strings":["Google, USA"],"affiliations":[{"raw_affiliation_string":"Google, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015715837","display_name":"Sokhar Samb","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sokhar Samb","raw_affiliation_strings":["AIMS-AMMI, Senegal","Masakhane NLP, Senegal"],"affiliations":[{"raw_affiliation_string":"AIMS-AMMI, Senegal","institution_ids":[]},{"raw_affiliation_string":"Masakhane NLP, Senegal","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077663332","display_name":"Beno\u00eet Sagot","orcid":"https://orcid.org/0000-0002-0107-8526"},"institutions":[{"id":"https://openalex.org/I1326498283","display_name":"Institut national de recherche en informatique et en automatique","ror":"https://ror.org/02kvxyf05","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1326498283"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Beno\u00eet Sagot","raw_affiliation_strings":["Inria, France"],"affiliations":[{"raw_affiliation_string":"Inria, France","institution_ids":["https://openalex.org/I1326498283"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024731885","display_name":"Clara E. Rivera","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113297","display_name":"Google (United Kingdom)","ror":"https://ror.org/024bc3e07","country_code":"GB","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210113297","https://openalex.org/I4210128969"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Clara Rivera","raw_affiliation_strings":["Google Research, UK"],"affiliations":[{"raw_affiliation_string":"Google Research, UK","institution_ids":["https://openalex.org/I4210113297"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051694298","display_name":"Annette Rios","orcid":"https://orcid.org/0000-0002-8943-3472"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Annette Rios","raw_affiliation_strings":["University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023621106","display_name":"Isabel Papadimitriou","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Isabel Papadimitriou","raw_affiliation_strings":["Stanford University, USA"],"affiliations":[{"raw_affiliation_string":"Stanford University, USA","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010053988","display_name":"Salomey Osei","orcid":"https://orcid.org/0000-0003-1900-3124"},"institutions":[{"id":"https://openalex.org/I28046988","display_name":"Kwame Nkrumah University of Science and Technology","ror":"https://ror.org/00cb23x68","country_code":"GH","type":"education","lineage":["https://openalex.org/I28046988"]}],"countries":["GH"],"is_corresponding":false,"raw_author_name":"Salomey Osei","raw_affiliation_strings":["Kwame Nkrumah University of Science and Technology, Ghana","Masakhane NLP, Ghana"],"affiliations":[{"raw_affiliation_string":"Kwame Nkrumah University of Science and Technology, Ghana","institution_ids":["https://openalex.org/I28046988"]},{"raw_affiliation_string":"Masakhane NLP, Ghana","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054913492","display_name":"Pedro Ortiz Su\u00e1rez","orcid":"https://orcid.org/0000-0003-0343-8852"},"institutions":[{"id":"https://openalex.org/I39804081","display_name":"Sorbonne Universit\u00e9","ror":"https://ror.org/02en5vm52","country_code":"FR","type":"education","lineage":["https://openalex.org/I39804081"]},{"id":"https://openalex.org/I1326498283","display_name":"Institut national de recherche en informatique et en automatique","ror":"https://ror.org/02kvxyf05","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1326498283"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Pedro Ortiz Suarez","raw_affiliation_strings":["Inria, France","Sorbonne Universit\u00e9, France"],"affiliations":[{"raw_affiliation_string":"Inria, France","institution_ids":["https://openalex.org/I1326498283"]},{"raw_affiliation_string":"Sorbonne Universit\u00e9, France","institution_ids":["https://openalex.org/I39804081"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016069022","display_name":"Iroro Orife","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Iroro Orife","raw_affiliation_strings":["Masakhane NLP, USA","Niger-Volta LTI, USA"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]},{"raw_affiliation_string":"Niger-Volta LTI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088147173","display_name":"Kelechi Ogueji","orcid":"https://orcid.org/0009-0004-7919-9303"},"institutions":[{"id":"https://openalex.org/I151746483","display_name":"University of Waterloo","ror":"https://ror.org/01aff2v68","country_code":"CA","type":"education","lineage":["https://openalex.org/I151746483"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Kelechi Ogueji","raw_affiliation_strings":["Masakhane NLP, USA","University of Waterloo, Canada"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]},{"raw_affiliation_string":"University of Waterloo, Canada","institution_ids":["https://openalex.org/I151746483"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024826347","display_name":"Andre Niyongabo Rubungo","orcid":"https://orcid.org/0000-0003-3608-2039"},"institutions":[{"id":"https://openalex.org/I9617848","display_name":"Universitat Polit\u00e8cnica de Catalunya","ror":"https://ror.org/03mb6wj31","country_code":"ES","type":"education","lineage":["https://openalex.org/I9617848"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Andre Niyongabo Rubungo","raw_affiliation_strings":["Masakhane NLP, Spain","Universitat Polit\u00e8cnica de Catalunya, Spain"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, Spain","institution_ids":[]},{"raw_affiliation_string":"Universitat Polit\u00e8cnica de Catalunya, Spain","institution_ids":["https://openalex.org/I9617848"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102874067","display_name":"Toan Nguyen","orcid":"https://orcid.org/0000-0003-2734-0622"},"institutions":[{"id":"https://openalex.org/I107639228","display_name":"University of Notre Dame","ror":"https://ror.org/00mkhxb43","country_code":"US","type":"education","lineage":["https://openalex.org/I107639228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Toan Q. Nguyen","raw_affiliation_strings":["University of Notre Dame, USA"],"affiliations":[{"raw_affiliation_string":"University of Notre Dame, USA","institution_ids":["https://openalex.org/I107639228"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010842007","display_name":"Mathias M\u00fcller","orcid":"https://orcid.org/0000-0002-7879-3552"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Mathias M\u00fcller","raw_affiliation_strings":["University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004652975","display_name":"Andr\u00e9 M\u00fcller","orcid":"https://orcid.org/0000-0001-5770-6723"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Andr\u00e9 M\u00fcller","raw_affiliation_strings":["University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002849148","display_name":"Shamsuddeen Hassan Muhammad","orcid":"https://orcid.org/0000-0001-7708-0799"},"institutions":[{"id":"https://openalex.org/I919958821","display_name":"Bayero University Kano","ror":"https://ror.org/049pzty39","country_code":"NG","type":"education","lineage":["https://openalex.org/I919958821"]}],"countries":["NG"],"is_corresponding":false,"raw_author_name":"Shamsuddeen Hassan Muhammad","raw_affiliation_strings":["Bayero University Kano, Nigeria","Masakhane NLP, Nigeria"],"affiliations":[{"raw_affiliation_string":"Bayero University Kano, Nigeria","institution_ids":["https://openalex.org/I919958821"]},{"raw_affiliation_string":"Masakhane NLP, Nigeria","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108643603","display_name":"N Sakib Muhammad","orcid":null},"institutions":[{"id":"https://openalex.org/I919958821","display_name":"Bayero University Kano","ror":"https://ror.org/049pzty39","country_code":"NG","type":"education","lineage":["https://openalex.org/I919958821"]}],"countries":["NG"],"is_corresponding":false,"raw_author_name":"Nanda Muhammad","raw_affiliation_strings":["Bayero University Kano, Nigeria"],"affiliations":[{"raw_affiliation_string":"Bayero University Kano, Nigeria","institution_ids":["https://openalex.org/I919958821"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001036606","display_name":"Ayanda Mnyakeni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ayanda Mnyakeni","raw_affiliation_strings":["Google, South Africa"],"affiliations":[{"raw_affiliation_string":"Google, South Africa","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046323511","display_name":"Jamshidbek Mirzakhalov","orcid":"https://orcid.org/0009-0006-7117-0834"},"institutions":[{"id":"https://openalex.org/I2613432","display_name":"University of South Florida","ror":"https://ror.org/032db5x82","country_code":"US","type":"education","lineage":["https://openalex.org/I2613432"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jamshidbek Mirzakhalov","raw_affiliation_strings":["Turkic Interlingua","University of South Florida, USA"],"affiliations":[{"raw_affiliation_string":"Turkic Interlingua","institution_ids":[]},{"raw_affiliation_string":"University of South Florida, USA","institution_ids":["https://openalex.org/I2613432"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009776906","display_name":"Tapiwanashe Matangira","orcid":null},"institutions":[{"id":"https://openalex.org/I4210148186","display_name":"Google (Canada)","ror":"https://ror.org/04d06q394","country_code":"CA","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969","https://openalex.org/I4210148186"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Tapiwanashe Matangira","raw_affiliation_strings":["Google, Canada"],"affiliations":[{"raw_affiliation_string":"Google, Canada","institution_ids":["https://openalex.org/I4210148186"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078178614","display_name":"Colin Leong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Colin Leong","raw_affiliation_strings":["Masakhane NLP, USA"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061040214","display_name":"Nze Lawson","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nze Lawson","raw_affiliation_strings":["Google, USA"],"affiliations":[{"raw_affiliation_string":"Google, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050484501","display_name":"Sneha Kudugunta","orcid":"https://orcid.org/0000-0002-0186-2433"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sneha Kudugunta","raw_affiliation_strings":["Google Research, USA"],"affiliations":[{"raw_affiliation_string":"Google Research, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000126238","display_name":"Yacine Jernite","orcid":"https://orcid.org/0000-0002-8053-6862"},"institutions":[{"id":"https://openalex.org/I4387154989","display_name":"Hugging Face","ror":"https://ror.org/02grspc61","country_code":null,"type":"company","lineage":["https://openalex.org/I4387154989"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yacine Jernite","raw_affiliation_strings":["Hugging Face, USA","Masakhane NLP, USA"],"affiliations":[{"raw_affiliation_string":"Hugging Face, USA","institution_ids":["https://openalex.org/I4387154989"]},{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032898501","display_name":"Mathias Jenny","orcid":"https://orcid.org/0000-0001-8919-1170"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Mathias Jenny","raw_affiliation_strings":["University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035914396","display_name":"Orhan F\u0131rat","orcid":"https://orcid.org/0000-0001-5775-2420"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Orhan Firat","raw_affiliation_strings":["Google Research, USA","Turkic Interlingua"],"affiliations":[{"raw_affiliation_string":"Google Research, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Turkic Interlingua","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058219704","display_name":"Bonaventure F. P. Dossou","orcid":"https://orcid.org/0000-0002-0519-1761"},"institutions":[{"id":"https://openalex.org/I193619901","display_name":"Constructor University","ror":"https://ror.org/02yrs2n53","country_code":"DE","type":"education","lineage":["https://openalex.org/I193619901"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bonaventure F. P. Dossou","raw_affiliation_strings":["Jacobs University Bremen, Germany","Masakhane NLP, Germany"],"affiliations":[{"raw_affiliation_string":"Jacobs University Bremen, Germany","institution_ids":["https://openalex.org/I193619901"]},{"raw_affiliation_string":"Masakhane NLP, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052516855","display_name":"Sakhile Dlamini","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sakhile Dlamini","raw_affiliation_strings":["Google, USA"],"affiliations":[{"raw_affiliation_string":"Google, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060065532","display_name":"Nisansa de Silva","orcid":"https://orcid.org/0000-0002-5361-4810"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Nisansa de Silva","raw_affiliation_strings":["University of Moratuwa, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"University of Moratuwa, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081603870","display_name":"\u015eevket Ball\u0131","orcid":"https://orcid.org/0000-0001-6600-6315"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Sakine \u00c7abuk Ball\u0131","raw_affiliation_strings":["University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076733732","display_name":"Stella Biderman","orcid":"https://orcid.org/0000-0001-8228-1042"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stella Biderman","raw_affiliation_strings":["EleutherAI, USA"],"affiliations":[{"raw_affiliation_string":"EleutherAI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015180851","display_name":"Alessia Battisti","orcid":null},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Alessia Battisti","raw_affiliation_strings":["University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033633587","display_name":"Ahmed Baruwa","orcid":null},"institutions":[{"id":"https://openalex.org/I45711476","display_name":"Obafemi Awolowo University","ror":"https://ror.org/04snhqa82","country_code":"NG","type":"education","lineage":["https://openalex.org/I45711476"]}],"countries":["NG"],"is_corresponding":false,"raw_author_name":"Ahmed Baruwa","raw_affiliation_strings":["Masakhane NLP, USA","Obafemi Awolowo University, Nigeria"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]},{"raw_affiliation_string":"Obafemi Awolowo University, Nigeria","institution_ids":["https://openalex.org/I45711476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024316712","display_name":"Ankur Bapna","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ankur Bapna","raw_affiliation_strings":["Google Research, USA"],"affiliations":[{"raw_affiliation_string":"Google Research, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007389429","display_name":"Pallavi Baljekar","orcid":null},"institutions":[{"id":"https://openalex.org/I4210148186","display_name":"Google (Canada)","ror":"https://ror.org/04d06q394","country_code":"CA","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969","https://openalex.org/I4210148186"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Pallavi Baljekar","raw_affiliation_strings":["Google Research, Canada"],"affiliations":[{"raw_affiliation_string":"Google Research, Canada","institution_ids":["https://openalex.org/I4210148186"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013873474","display_name":"Israel Abebe Azime","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114632","display_name":"Ambo University","ror":"https://ror.org/02e6z0y17","country_code":"ET","type":"education","lineage":["https://openalex.org/I4210114632"]}],"countries":["ET"],"is_corresponding":false,"raw_author_name":"Israel Abebe Azime","raw_affiliation_strings":["AIMS-AMMI, Ethiopia","Masakhane NLP, Ethiopia"],"affiliations":[{"raw_affiliation_string":"AIMS-AMMI, Ethiopia","institution_ids":["https://openalex.org/I4210114632"]},{"raw_affiliation_string":"Masakhane NLP, Ethiopia","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089329850","display_name":"Ayodele Awokoya","orcid":null},"institutions":[{"id":"https://openalex.org/I181631907","display_name":"University of Ibadan","ror":"https://ror.org/03wx2rr30","country_code":"NG","type":"education","lineage":["https://openalex.org/I181631907"]}],"countries":["NG"],"is_corresponding":false,"raw_author_name":"Ayodele Awokoya","raw_affiliation_strings":["Masakhane NLP, Nigeria","University of Ibadan, Nigeria"],"affiliations":[{"raw_affiliation_string":"Masakhane NLP, Nigeria","institution_ids":[]},{"raw_affiliation_string":"University of Ibadan, Nigeria","institution_ids":["https://openalex.org/I181631907"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086132343","display_name":"Duygu Ataman","orcid":null},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Duygu Ataman","raw_affiliation_strings":["Turkic Interlingua, Switzerland","University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"Turkic Interlingua, Switzerland","institution_ids":[]},{"raw_affiliation_string":"University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045507345","display_name":"Orevaoghene Ahia","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Orevaoghene Ahia","raw_affiliation_strings":["Google, USA","Instadeep, Nigeria","Masakhane NLP, USA"],"affiliations":[{"raw_affiliation_string":"Google, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Instadeep, Nigeria","institution_ids":[]},{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060627852","display_name":"Oghenefego Ahia","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Oghenefego Ahia","raw_affiliation_strings":["Google, USA","Instadeep, Nigeria","Masakhane NLP, USA"],"affiliations":[{"raw_affiliation_string":"Google, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Instadeep, Nigeria","institution_ids":[]},{"raw_affiliation_string":"Masakhane NLP, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103140610","display_name":"Sweta Agrawal","orcid":"https://orcid.org/0000-0001-5692-0816"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sweta Agrawal","raw_affiliation_strings":["University of Maryland, USA"],"affiliations":[{"raw_affiliation_string":"University of Maryland, USA","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076302977","display_name":"Mofetoluwa Adeyemi","orcid":"https://orcid.org/0009-0003-2859-7136"},"institutions":[{"id":"https://openalex.org/I4210087579","display_name":"National Space Research and Development Agency","ror":"https://ror.org/005epk420","country_code":"NG","type":"government","lineage":["https://openalex.org/I2801477186","https://openalex.org/I4210087579","https://openalex.org/I4210122850"]}],"countries":["NG"],"is_corresponding":false,"raw_author_name":"Mofetoluwa Adeyemi","raw_affiliation_strings":["Defence Space Administration Abuja, Nigeria","Masakhane NLP, Nigeria"],"affiliations":[{"raw_affiliation_string":"Defence Space Administration Abuja, Nigeria","institution_ids":["https://openalex.org/I4210087579"]},{"raw_affiliation_string":"Masakhane NLP, Nigeria","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":13,"institutions_distinct_count":52,"corresponding_author_ids":["https://openalex.org/A5048307591"],"corresponding_institution_ids":["https://openalex.org/I4210148186"],"apc_list":null,"apc_paid":null,"fwci":19.5954,"has_fulltext":false,"cited_by_count":163,"citation_normalized_percentile":{"value":0.99509376,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":"10","issue":null,"first_page":"50","last_page":"72"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.881841778755188},{"id":"https://openalex.org/keywords/usable","display_name":"USable","score":0.7601500749588013},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.6352940201759338},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.611190915107727},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5530468225479126},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.536851704120636},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3965432047843933},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.32910382747650146},{"id":"https://openalex.org/keywords/accounting","display_name":"Accounting","score":0.07864481210708618}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.881841778755188},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.7601500749588013},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.6352940201759338},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.611190915107727},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5530468225479126},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.536851704120636},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3965432047843933},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.32910382747650146},{"id":"https://openalex.org/C121955636","wikidata":"https://www.wikidata.org/wiki/Q4116214","display_name":"Accounting","level":1,"score":0.07864481210708618},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1162/tacl_a_00447","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00447","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00447/1986585/tacl_a_00447.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2103.12028","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2103.12028","pdf_url":"https://arxiv.org/pdf/2103.12028","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:HAL:hal-03177623v1","is_oa":true,"landing_page_url":"https://inria.hal.science/hal-03177623","pdf_url":null,"source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Transactions of the Association for Computational Linguistics, 2022, 10, pp.50-72. &#x27E8;10.1162/tacl_a_00447&#x27E9;","raw_type":"Journal articles"},{"id":"pmh:oai:doaj.org/article:183709a1b69d42379e0bf5e2c5ba1497","is_oa":true,"landing_page_url":"https://doaj.org/article/183709a1b69d42379e0bf5e2c5ba1497","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Transactions of the Association for Computational Linguistics, Vol 10 (2023)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1162/tacl_a_00447","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00447","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00447/1986585/tacl_a_00447.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7900000214576721,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3137010024.pdf","grobid_xml":"https://content.openalex.org/works/W3137010024.grobid-xml"},"referenced_works_count":105,"referenced_works":["https://openalex.org/W37508832","https://openalex.org/W1554540371","https://openalex.org/W1583837637","https://openalex.org/W1905522558","https://openalex.org/W2081011876","https://openalex.org/W2117278770","https://openalex.org/W2138679451","https://openalex.org/W2199580741","https://openalex.org/W2252046065","https://openalex.org/W2563351168","https://openalex.org/W2607303097","https://openalex.org/W2750779823","https://openalex.org/W2771976988","https://openalex.org/W2773493195","https://openalex.org/W2795038878","https://openalex.org/W2802642435","https://openalex.org/W2805411799","https://openalex.org/W2891555348","https://openalex.org/W2911227954","https://openalex.org/W2948902769","https://openalex.org/W2949303037","https://openalex.org/W2954265451","https://openalex.org/W2958953787","https://openalex.org/W2962890089","https://openalex.org/W2963281280","https://openalex.org/W2963341956","https://openalex.org/W2963424553","https://openalex.org/W2963626623","https://openalex.org/W2963667932","https://openalex.org/W2963919854","https://openalex.org/W2964085268","https://openalex.org/W2970529093","https://openalex.org/W2973088264","https://openalex.org/W2977458338","https://openalex.org/W2986154550","https://openalex.org/W2987103574","https://openalex.org/W3013840636","https://openalex.org/W3034617741","https://openalex.org/W3035016936","https://openalex.org/W3035390927","https://openalex.org/W3035579820","https://openalex.org/W3080175947","https://openalex.org/W3082274269","https://openalex.org/W3093871477","https://openalex.org/W3094737233","https://openalex.org/W3095845993","https://openalex.org/W3098749165","https://openalex.org/W3098903812","https://openalex.org/W3098998028","https://openalex.org/W3099919888","https://openalex.org/W3101860695","https://openalex.org/W3102046836","https://openalex.org/W3103187652","https://openalex.org/W3105220303","https://openalex.org/W3105425516","https://openalex.org/W3114757058","https://openalex.org/W3114950584","https://openalex.org/W3115778530","https://openalex.org/W3118781290","https://openalex.org/W3119746452","https://openalex.org/W3119872155","https://openalex.org/W3127069001","https://openalex.org/W3133702157","https://openalex.org/W3152788712","https://openalex.org/W3154147337","https://openalex.org/W3156216837","https://openalex.org/W3156703103","https://openalex.org/W3169369929","https://openalex.org/W3169483174","https://openalex.org/W3169705981","https://openalex.org/W3174269049","https://openalex.org/W4212774754","https://openalex.org/W4288089799","https://openalex.org/W4292779060","https://openalex.org/W4294152847","https://openalex.org/W4297801177","https://openalex.org/W4297823766","https://openalex.org/W4298338448","https://openalex.org/W4300963525","https://openalex.org/W4308264370","https://openalex.org/W4391156274","https://openalex.org/W4404783772","https://openalex.org/W6601523051","https://openalex.org/W6639825529","https://openalex.org/W6677328538","https://openalex.org/W6691972233","https://openalex.org/W6731031554","https://openalex.org/W6748304040","https://openalex.org/W6749630143","https://openalex.org/W6751678649","https://openalex.org/W6755207826","https://openalex.org/W6765469073","https://openalex.org/W6769627184","https://openalex.org/W6778883912","https://openalex.org/W6782866269","https://openalex.org/W6784308239","https://openalex.org/W6784447870","https://openalex.org/W6784577980","https://openalex.org/W6788175385","https://openalex.org/W6788798556","https://openalex.org/W6789924496","https://openalex.org/W6790130950","https://openalex.org/W6793818051","https://openalex.org/W6794128349","https://openalex.org/W6845401343"],"related_works":["https://openalex.org/W2982321410","https://openalex.org/W2392004567","https://openalex.org/W2046296964","https://openalex.org/W2940029036","https://openalex.org/W4388292429","https://openalex.org/W2756595502","https://openalex.org/W2010789764","https://openalex.org/W2187233292","https://openalex.org/W2219281195","https://openalex.org/W4389422031"],"abstract_inverted_index":{"Abstract":[0],"With":[1],"the":[2,33,100],"success":[3],"of":[4,21,28,35,72],"large-scale":[5],"pre-training":[6],"and":[7,63,98,112,116],"multilingual":[8,114],"modeling":[9],"in":[10],"Natural":[11],"Language":[12],"Processing":[13],"(NLP),":[14],"recent":[15],"years":[16],"have":[17,52,59],"seen":[18],"a":[19,64],"proliferation":[20],"large,":[22],"Web-mined":[23],"text":[24],"datasets":[25,44],"covering":[26],"hundreds":[27],"languages.":[29],"We":[30,85],"manually":[31],"audit":[32,102],"quality":[34],"205":[36],"language-specific":[37],"corpora":[38,51,58,115],"released":[39],"with":[40,103,122],"five":[41],"major":[42],"public":[43],"(CCAligned,":[45],"ParaCrawl,":[46],"WikiMatrix,":[47],"OSCAR,":[48],"mC4).":[49],"Lower-resource":[50],"systematic":[53],"issues:":[54],"At":[55],"least":[56],"15":[57],"no":[60],"usable":[61],"text,":[62],"significant":[65],"fraction":[66],"contains":[67],"less":[68],"than":[69],"50%":[70],"sentences":[71],"acceptable":[73],"quality.":[74],"In":[75],"addition,":[76],"many":[77],"are":[78,90],"mislabeled":[79],"or":[80],"use":[81],"nonstandard/ambiguous":[82],"language":[83],"codes.":[84],"demonstrate":[86],"that":[87,120],"these":[88],"issues":[89],"easy":[91],"to":[92,110],"detect":[93],"even":[94],"for":[95],"non-proficient":[96],"speakers,":[97],"supplement":[99],"human":[101],"automatic":[104],"analyses.":[105],"Finally,":[106],"we":[107],"recommend":[108],"techniques":[109],"evaluate":[111],"improve":[113],"discuss":[117],"potential":[118],"risks":[119],"come":[121],"low-quality":[123],"data":[124],"releases.":[125]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":22},{"year":2024,"cited_by_count":24},{"year":2023,"cited_by_count":56},{"year":2022,"cited_by_count":40},{"year":2021,"cited_by_count":16},{"year":2020,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
