{"id":"https://openalex.org/W2979419423","doi":"https://doi.org/10.1145/3342351","title":"An Automatic and a Machine-assisted Method to Clean Bilingual Corpus","display_name":"An Automatic and a Machine-assisted Method to Clean Bilingual Corpus","publication_year":2019,"publication_date":"2019-10-09","ids":{"openalex":"https://openalex.org/W2979419423","doi":"https://doi.org/10.1145/3342351","mag":"2979419423"},"language":"en","primary_location":{"id":"doi:10.1145/3342351","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3342351","pdf_url":null,"source":{"id":"https://openalex.org/S4306421405","display_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","issn_l":"2375-4699","issn":["2375-4699","2375-4702"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014030486","display_name":"Jyoti Srivastava","orcid":"https://orcid.org/0000-0001-8105-4809"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jyoti Srivastava","raw_affiliation_strings":["Madanapalle Institute of Technology 8 Science, Madanapalle, Andhra Pradesh, India"],"affiliations":[{"raw_affiliation_string":"Madanapalle Institute of Technology 8 Science, Madanapalle, Andhra Pradesh, India","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110006428","display_name":"Sudip Sanyal","orcid":null},"institutions":[{"id":"https://openalex.org/I1323093577","display_name":"BML Munjal University","ror":"https://ror.org/058ay3j75","country_code":"IN","type":"education","lineage":["https://openalex.org/I1323093577"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Sudip Sanyal","raw_affiliation_strings":["BML Munjal University, Gurugram, Haryana, India"],"affiliations":[{"raw_affiliation_string":"BML Munjal University, Gurugram, Haryana, India","institution_ids":["https://openalex.org/I1323093577"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051896692","display_name":"Ashish Kumar Srivastava","orcid":"https://orcid.org/0000-0001-9863-528X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ashish Kumar Srivastava","raw_affiliation_strings":["Madanapalle Institute of Technology 8 Science, Madanapalle, Andhra Pradesh, India"],"affiliations":[{"raw_affiliation_string":"Madanapalle Institute of Technology 8 Science, Madanapalle, Andhra Pradesh, India","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5014030486"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.14,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.57516947,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":93},"biblio":{"volume":"19","issue":"1","first_page":"1","last_page":"19"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9904999732971191,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8410753607749939},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7523449659347534},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.725954532623291},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.6831627488136292},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.5889050364494324},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5564482808113098},{"id":"https://openalex.org/keywords/hindi","display_name":"Hindi","score":0.4848114550113678},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.482991099357605},{"id":"https://openalex.org/keywords/corpus-linguistics","display_name":"Corpus linguistics","score":0.47043225169181824},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4635327160358429},{"id":"https://openalex.org/keywords/evaluation-of-machine-translation","display_name":"Evaluation of machine translation","score":0.45356178283691406},{"id":"https://openalex.org/keywords/example-based-machine-translation","display_name":"Example-based machine translation","score":0.20666125416755676},{"id":"https://openalex.org/keywords/machine-translation-software-usability","display_name":"Machine translation software usability","score":0.18420001864433289},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13584908843040466}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8410753607749939},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7523449659347534},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.725954532623291},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.6831627488136292},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.5889050364494324},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5564482808113098},{"id":"https://openalex.org/C519982507","wikidata":"https://www.wikidata.org/wiki/Q1568","display_name":"Hindi","level":2,"score":0.4848114550113678},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.482991099357605},{"id":"https://openalex.org/C532629269","wikidata":"https://www.wikidata.org/wiki/Q865083","display_name":"Corpus linguistics","level":2,"score":0.47043225169181824},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4635327160358429},{"id":"https://openalex.org/C135784402","wikidata":"https://www.wikidata.org/wiki/Q6958279","display_name":"Evaluation of machine translation","level":5,"score":0.45356178283691406},{"id":"https://openalex.org/C24687705","wikidata":"https://www.wikidata.org/wiki/Q3753284","display_name":"Example-based machine translation","level":3,"score":0.20666125416755676},{"id":"https://openalex.org/C148526163","wikidata":"https://www.wikidata.org/wiki/Q6723733","display_name":"Machine translation software usability","level":4,"score":0.18420001864433289},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13584908843040466},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3342351","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3342351","pdf_url":null,"source":{"id":"https://openalex.org/S4306421405","display_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","issn_l":"2375-4699","issn":["2375-4699","2375-4702"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Asian and Low-Resource Language Information Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8700000047683716,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W22168010","https://openalex.org/W128638292","https://openalex.org/W1480519300","https://openalex.org/W1489181569","https://openalex.org/W1519512946","https://openalex.org/W1560797130","https://openalex.org/W1593271688","https://openalex.org/W2006969979","https://openalex.org/W2009847538","https://openalex.org/W2047295649","https://openalex.org/W2065925951","https://openalex.org/W2078861931","https://openalex.org/W2096384330","https://openalex.org/W2096653374","https://openalex.org/W2097333193","https://openalex.org/W2101105183","https://openalex.org/W2107695330","https://openalex.org/W2108459463","https://openalex.org/W2108997961","https://openalex.org/W2111329418","https://openalex.org/W2117652747","https://openalex.org/W2124807415","https://openalex.org/W2129734311","https://openalex.org/W2140030083","https://openalex.org/W2144061901","https://openalex.org/W2156985047","https://openalex.org/W2159872955","https://openalex.org/W2169724380","https://openalex.org/W2251139694","https://openalex.org/W2251206152","https://openalex.org/W2251924162","https://openalex.org/W2330486491","https://openalex.org/W2407892396","https://openalex.org/W2610930722","https://openalex.org/W2615753704","https://openalex.org/W2913048852","https://openalex.org/W3001287559","https://openalex.org/W3141239769","https://openalex.org/W3201817155","https://openalex.org/W4391418904"],"related_works":["https://openalex.org/W3151736118","https://openalex.org/W3097971125","https://openalex.org/W2962780935","https://openalex.org/W2026303297","https://openalex.org/W4387490002","https://openalex.org/W3196981096","https://openalex.org/W2395826315","https://openalex.org/W2169279899","https://openalex.org/W2532807140","https://openalex.org/W2152633844"],"abstract_inverted_index":{"Two":[0,152],"different":[1,153],"methods":[2,215],"of":[3,60,86,116,122,129,174,179,192,195,212],"corpus":[4,98,117,173],"cleaning":[5,35,118],"are":[6,69,78,89,156,164,182],"presented":[7,218],"in":[8,74,189],"this":[9,87,180],"article.":[10],"One":[11,125],"is":[12,17,27,32,44,106,127,139,203,216],"a":[13,82,120,168],"machine-assisted":[14,51],"technique,":[15],"which":[16,31,77],"good":[18],"to":[19,46,71],"clean":[20,112,208],"small-sized":[21,169],"parallel":[22,37,172],"corpus,":[23,76,187],"and":[24,58,63,131,135],"the":[25,61,75,92,111,136,185,190,193,213],"other":[26],"an":[28],"automatic":[29],"method,":[30],"suitable":[33],"for":[34,110,148,158],"large-sized":[36],"corpus.":[38,113,209,221],"A":[39,210],"baseline":[40],"SMT":[41],"(MOSES)":[42],"system":[43],"used":[45,53,70,157],"evaluate":[47],"these":[48,159],"methods.":[49],"The":[50,100,200],"technique":[52],"two":[54,123,160,214],"features:":[55],"word":[56],"alignment":[57,141],"length":[59,128],"source":[62,130],"target":[64,132],"language":[65,133],"sentence.":[66],"These":[67,162],"features":[68],"detect":[72],"mistranslations":[73],"then":[79],"handled":[80],"by":[81,108,144,166,205],"human":[83],"translator.":[84],"Experiments":[85,178],"method":[88,115,181],"conducted":[90,183],"on":[91,184,207,219],"English-to-Indian":[93],"Language":[94],"Machine":[95],"Translation":[96],"(EILMT)":[97],"(English-Hindi).":[99],"Bilingual":[101],"Evaluation":[102],"Understudy":[103],"(BLEU)":[104],"score":[105,142,202],"improved":[107,204],"0.47%":[109],"Automatic":[114],"uses":[119],"combination":[121],"features.":[124,161],"feature":[126,138],"sentence":[134,150,176],"second":[137],"Viterbi":[140],"generated":[143],"Hidden":[145],"Markov":[146],"Model":[147],"each":[149],"pair.":[151],"threshold":[154],"values":[155,163],"decided":[165],"using":[167],"manually":[170],"annotated":[171],"206":[175],"pairs.":[177],"HindEnCorp":[186],"released":[188],"workshop":[191],"Association":[194],"Computational":[196],"Linguistics":[197],"(ACL":[198],"2014).":[199],"BLEU":[201],"0.6%":[206],"comparison":[211],"also":[217],"EILMT":[220]},"counts_by_year":[{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
