{"id":"https://openalex.org/W3142584950","doi":"https://doi.org/10.1007/s10579-021-09536-6","title":"A large English\u2013Thai parallel corpus from the web and machine-generated text","display_name":"A large English\u2013Thai parallel corpus from the web and machine-generated text","publication_year":2021,"publication_date":"2021-03-30","ids":{"openalex":"https://openalex.org/W3142584950","doi":"https://doi.org/10.1007/s10579-021-09536-6","mag":"3142584950"},"language":"en","primary_location":{"id":"doi:10.1007/s10579-021-09536-6","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10579-021-09536-6","pdf_url":null,"source":{"id":"https://openalex.org/S4306424877","display_name":"Language Resources and Evaluation","issn_l":"1574-020X","issn":["1574-020X","1574-0218"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Language Resources and Evaluation","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1007/s10579-021-09536-6","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054771771","display_name":"Lalita Lowphansirikul","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153049","display_name":"Vidyasirimedhi Institute of Science and Technology","ror":"https://ror.org/053jehz60","country_code":"TH","type":"education","lineage":["https://openalex.org/I4210153049"]}],"countries":["TH"],"is_corresponding":false,"raw_author_name":"Lalita Lowphansirikul","raw_affiliation_strings":["School of Information Science and Technology, Vidyasirimedhi Institution of Science and Technology, Rayong, Thailand"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Vidyasirimedhi Institution of Science and Technology, Rayong, Thailand","institution_ids":["https://openalex.org/I4210153049"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001689161","display_name":"Charin Polpanumas","orcid":"https://orcid.org/0000-0001-7822-4600"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Charin Polpanumas","raw_affiliation_strings":["PyThaiNLP, Bangkok, Thailand"],"affiliations":[{"raw_affiliation_string":"PyThaiNLP, Bangkok, Thailand","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008312239","display_name":"Attapol Rutherford","orcid":"https://orcid.org/0000-0003-2270-6082"},"institutions":[{"id":"https://openalex.org/I158708052","display_name":"Chulalongkorn University","ror":"https://ror.org/028wp3y58","country_code":"TH","type":"education","lineage":["https://openalex.org/I158708052"]}],"countries":["TH"],"is_corresponding":true,"raw_author_name":"Attapol T. Rutherford","raw_affiliation_strings":["Department of Linguistics, Chulalongkorn University, Bangkok, Thailand","Teaching and Learning Thai as a Foreign Language Group, Bangkok, Thailand"],"affiliations":[{"raw_affiliation_string":"Department of Linguistics, Chulalongkorn University, Bangkok, Thailand","institution_ids":["https://openalex.org/I158708052"]},{"raw_affiliation_string":"Teaching and Learning Thai as a Foreign Language Group, Bangkok, Thailand","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5071030042","display_name":"Sarana Nutanong","orcid":"https://orcid.org/0000-0003-1068-850X"},"institutions":[{"id":"https://openalex.org/I4210153049","display_name":"Vidyasirimedhi Institute of Science and Technology","ror":"https://ror.org/053jehz60","country_code":"TH","type":"education","lineage":["https://openalex.org/I4210153049"]}],"countries":["TH"],"is_corresponding":false,"raw_author_name":"Sarana Nutanong","raw_affiliation_strings":["School of Information Science and Technology, Vidyasirimedhi Institution of Science and Technology, Rayong, Thailand"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Vidyasirimedhi Institution of Science and Technology, Rayong, Thailand","institution_ids":["https://openalex.org/I4210153049"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5008312239"],"corresponding_institution_ids":["https://openalex.org/I158708052"],"apc_list":null,"apc_paid":null,"fwci":3.2568,"has_fulltext":false,"cited_by_count":25,"citation_normalized_percentile":{"value":0.92846098,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":99},"biblio":{"volume":"56","issue":"2","first_page":"477","last_page":"499"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9902999997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8546465635299683},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.8038155436515808},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7248073220252991},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.7071764469146729},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.5866382718086243},{"id":"https://openalex.org/keywords/license","display_name":"License","score":0.5396416783332825},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5281950235366821},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4548488259315491}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8546465635299683},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.8038155436515808},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7248073220252991},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7071764469146729},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.5866382718086243},{"id":"https://openalex.org/C2780560020","wikidata":"https://www.wikidata.org/wiki/Q79719","display_name":"License","level":2,"score":0.5396416783332825},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5281950235366821},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4548488259315491},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s10579-021-09536-6","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10579-021-09536-6","pdf_url":null,"source":{"id":"https://openalex.org/S4306424877","display_name":"Language Resources and Evaluation","issn_l":"1574-020X","issn":["1574-020X","1574-0218"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Language Resources and Evaluation","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s10579-021-09536-6","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10579-021-09536-6","pdf_url":null,"source":{"id":"https://openalex.org/S4306424877","display_name":"Language Resources and Evaluation","issn_l":"1574-020X","issn":["1574-020X","1574-0218"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Language Resources and Evaluation","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6299999952316284,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1808931822","display_name":null,"funder_award_id":"MRG6280175","funder_id":"https://openalex.org/F4320322614","funder_display_name":"Thailand Research Fund"}],"funders":[{"id":"https://openalex.org/F4320321557","display_name":"Chulalongkorn University","ror":"https://ror.org/028wp3y58"},{"id":"https://openalex.org/F4320322614","display_name":"Thailand Research Fund","ror":"https://ror.org/03gd1bh37"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":56,"referenced_works":["https://openalex.org/W22168010","https://openalex.org/W55126966","https://openalex.org/W131533222","https://openalex.org/W630532510","https://openalex.org/W1489181569","https://openalex.org/W1979754320","https://openalex.org/W2005241960","https://openalex.org/W2041532239","https://openalex.org/W2101105183","https://openalex.org/W2133564696","https://openalex.org/W2143017621","https://openalex.org/W2160807445","https://openalex.org/W2186890008","https://openalex.org/W2250342921","https://openalex.org/W2250553775","https://openalex.org/W2251654371","https://openalex.org/W2290964183","https://openalex.org/W2419539795","https://openalex.org/W2496235729","https://openalex.org/W2525778437","https://openalex.org/W2572474373","https://openalex.org/W2613904329","https://openalex.org/W2626778328","https://openalex.org/W2758884106","https://openalex.org/W2794365787","https://openalex.org/W2905927205","https://openalex.org/W2933138175","https://openalex.org/W2949303037","https://openalex.org/W2953320089","https://openalex.org/W2953830716","https://openalex.org/W2963250244","https://openalex.org/W2963506925","https://openalex.org/W2963532001","https://openalex.org/W2963807318","https://openalex.org/W2970686691","https://openalex.org/W2971737394","https://openalex.org/W2973049837","https://openalex.org/W2977458338","https://openalex.org/W2985067290","https://openalex.org/W2986148666","https://openalex.org/W3038033387","https://openalex.org/W3099138433","https://openalex.org/W3208089349","https://openalex.org/W4385245566","https://openalex.org/W4392051461","https://openalex.org/W6629226572","https://openalex.org/W6686649123","https://openalex.org/W6691629350","https://openalex.org/W6723821839","https://openalex.org/W6731744468","https://openalex.org/W6737778391","https://openalex.org/W6739901393","https://openalex.org/W6744844098","https://openalex.org/W6757517122","https://openalex.org/W6893624790","https://openalex.org/W7051469422"],"related_works":["https://openalex.org/W2606446052","https://openalex.org/W2036021480","https://openalex.org/W1619002","https://openalex.org/W2775554247","https://openalex.org/W2110168585","https://openalex.org/W3107474891","https://openalex.org/W2250213760","https://openalex.org/W4386247111","https://openalex.org/W4327642362","https://openalex.org/W2587014613"],"abstract_inverted_index":{"The":[0,126,137],"primary":[1],"objective":[2],"of":[3,88,100],"our":[4,145],"work":[5,146],"is":[6,114,128],"to":[7,84,95,143],"build":[8],"a":[9,53],"large-scale":[10],"English\u2013Thai":[11,23,124],"dataset":[12,26,83,127],"for":[13,61,103,120,130],"training":[14,118],"neural":[15],"machine":[16,24,77],"translation":[17,25,71,78],"models.":[18],"We":[19,57,74],"construct":[20],"scb-mt-en-th-2020,":[21],"an":[22],"with":[27],"over":[28],"1":[29],"million":[30],"segment":[31],"pairs,":[32],"curated":[33],"from":[34],"various":[35],"sources:":[36],"news,":[37],"Wikipedia":[38],"articles,":[39],"SMS":[40],"messages,":[41],"task-based":[42],"dialogs,":[43],"web-crawled":[44],"data,":[45,63],"government":[46],"documents,":[47],"and":[48,66,70,105,123,140],"text":[49],"artificially":[50],"generated":[51],"by":[52],"pretrained":[54],"language":[55],"model.":[56],"present":[58],"the":[59,86,89,109,117],"methods":[60],"gathering":[62],"aligning":[64],"texts,":[65],"removing":[67],"preprocessing":[68],"noise":[69],"errors":[72],"automatically.":[73],"also":[75],"train":[76],"models":[79,92,139],"based":[80],"on":[81],"this":[82],"assess":[85],"quality":[87],"corpus.":[90],"Our":[91],"perform":[93],"comparably":[94],"Google":[96,107],"Translation":[97],"API":[98],"(as":[99],"May":[101],"2020)":[102],"Thai\u2013English":[104,122],"outperform":[106],"when":[108],"Open":[110],"Parallel":[111],"Corpus":[112],"(OPUS)":[113],"included":[115],"in":[116],"data":[119],"both":[121],"translation.":[125],"available":[129,148],"public":[131],"use":[132],"under":[133,149],"CC-BY-SA":[134],"4.0":[135],"License.":[136,151],"pre-trained":[138],"source":[141],"code":[142],"reproduce":[144],"are":[147],"Apache-2.0":[150]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":9},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2021-04-13T00:00:00"}
