{"id":"https://openalex.org/W2903297715","doi":"https://doi.org/10.18653/v1/w18-6481","title":"Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine translation evaluation metric: The NRC supervised submissions to the Parallel Corpus Filtering task","display_name":"Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine translation evaluation metric: The NRC supervised submissions to the Parallel Corpus Filtering task","publication_year":2018,"publication_date":"2018-01-01","ids":{"openalex":"https://openalex.org/W2903297715","doi":"https://doi.org/10.18653/v1/w18-6481","mag":"2903297715"},"language":"en","primary_location":{"id":"doi:10.18653/v1/w18-6481","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w18-6481","pdf_url":"https://www.aclweb.org/anthology/W18-6481.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Third Conference on Machine Translation: Shared Task Papers","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.aclweb.org/anthology/W18-6481.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049492975","display_name":"Chi-kiu Lo","orcid":"https://orcid.org/0000-0001-8714-7846"},"institutions":[{"id":"https://openalex.org/I4210159778","display_name":"National Research Council Canada","ror":"https://ror.org/04mte1k06","country_code":"CA","type":"government","lineage":["https://openalex.org/I4210159778"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Chi-kiu Lo","raw_affiliation_strings":["NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada"],"affiliations":[{"raw_affiliation_string":"NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada","institution_ids":["https://openalex.org/I4210159778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111588711","display_name":"Michel Simard","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159778","display_name":"National Research Council Canada","ror":"https://ror.org/04mte1k06","country_code":"CA","type":"government","lineage":["https://openalex.org/I4210159778"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Michel Simard","raw_affiliation_strings":["NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada"],"affiliations":[{"raw_affiliation_string":"NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada","institution_ids":["https://openalex.org/I4210159778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103359343","display_name":"Darlene Stewart","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159778","display_name":"National Research Council Canada","ror":"https://ror.org/04mte1k06","country_code":"CA","type":"government","lineage":["https://openalex.org/I4210159778"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Darlene Stewart","raw_affiliation_strings":["NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada"],"affiliations":[{"raw_affiliation_string":"NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada","institution_ids":["https://openalex.org/I4210159778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084782140","display_name":"Samuel Larkin","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159778","display_name":"National Research Council Canada","ror":"https://ror.org/04mte1k06","country_code":"CA","type":"government","lineage":["https://openalex.org/I4210159778"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Samuel Larkin","raw_affiliation_strings":["NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada"],"affiliations":[{"raw_affiliation_string":"NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada","institution_ids":["https://openalex.org/I4210159778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065257553","display_name":"Cyril Goutte","orcid":"https://orcid.org/0000-0003-4939-6555"},"institutions":[{"id":"https://openalex.org/I4210159778","display_name":"National Research Council Canada","ror":"https://ror.org/04mte1k06","country_code":"CA","type":"government","lineage":["https://openalex.org/I4210159778"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Cyril Goutte","raw_affiliation_strings":["NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada"],"affiliations":[{"raw_affiliation_string":"NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada","institution_ids":["https://openalex.org/I4210159778"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045031910","display_name":"Patrick Littell","orcid":"https://orcid.org/0000-0002-7173-0225"},"institutions":[{"id":"https://openalex.org/I4210159778","display_name":"National Research Council Canada","ror":"https://ror.org/04mte1k06","country_code":"CA","type":"government","lineage":["https://openalex.org/I4210159778"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Patrick Littell","raw_affiliation_strings":["NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada"],"affiliations":[{"raw_affiliation_string":"NRC-CNRC Multilingual Text Processing National Research Council Canada 1200 Montreal Road, Ottawa, ON K1A 0R6, Canada","institution_ids":["https://openalex.org/I4210159778"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5049492975"],"corresponding_institution_ids":["https://openalex.org/I4210159778"],"apc_list":null,"apc_paid":null,"fwci":4.2213,"has_fulltext":true,"cited_by_count":35,"citation_normalized_percentile":{"value":0.95333258,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"908","last_page":"916"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.851783037185669},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.7020167708396912},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.662790060043335},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.6592967510223389},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.5771386027336121},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.5624755024909973},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5124569535255432},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5069544315338135},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.48536980152130127},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4552350342273712},{"id":"https://openalex.org/keywords/fluency","display_name":"Fluency","score":0.44111713767051697},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.42571550607681274}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.851783037185669},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7020167708396912},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.662790060043335},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.6592967510223389},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.5771386027336121},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.5624755024909973},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5124569535255432},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5069544315338135},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.48536980152130127},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4552350342273712},{"id":"https://openalex.org/C2777413886","wikidata":"https://www.wikidata.org/wiki/Q3276013","display_name":"Fluency","level":2,"score":0.44111713767051697},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.42571550607681274},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/w18-6481","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w18-6481","pdf_url":"https://www.aclweb.org/anthology/W18-6481.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Third Conference on Machine Translation: Shared Task Papers","raw_type":"proceedings-article"},{"id":"pmh:oai:cisti-icist.nrc-cnrc.ca:cistinparc:c7a0017b-bde5-4154-90be-93d0a454b094","is_oa":true,"landing_page_url":"https://nrc-publications.canada.ca/eng/view/ft/?id=c7a0017b-bde5-4154-90be-93d0a454b094","pdf_url":"https://nrc-publications.canada.ca/eng/view/ft/?id=c7a0017b-bde5-4154-90be-93d0a454b094","source":{"id":"https://openalex.org/S7407055245","display_name":"NPARC","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.18653/v1/w18-6481","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w18-6481","pdf_url":"https://www.aclweb.org/anthology/W18-6481.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Third Conference on Machine Translation: Shared Task Papers","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7200000286102295}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2903297715.pdf","grobid_xml":"https://content.openalex.org/works/W2903297715.grobid-xml"},"referenced_works_count":52,"referenced_works":["https://openalex.org/W193080678","https://openalex.org/W932413789","https://openalex.org/W1560797130","https://openalex.org/W1631260214","https://openalex.org/W1834000468","https://openalex.org/W2006969979","https://openalex.org/W2038698865","https://openalex.org/W2047295649","https://openalex.org/W2087735403","https://openalex.org/W2101105183","https://openalex.org/W2108459463","https://openalex.org/W2109943925","https://openalex.org/W2115081467","https://openalex.org/W2117278770","https://openalex.org/W2123318312","https://openalex.org/W2132529109","https://openalex.org/W2134800885","https://openalex.org/W2141440284","https://openalex.org/W2148708890","https://openalex.org/W2153579005","https://openalex.org/W2153653739","https://openalex.org/W2159107349","https://openalex.org/W2159755860","https://openalex.org/W2161227214","https://openalex.org/W2180952760","https://openalex.org/W2250907725","https://openalex.org/W2251453970","https://openalex.org/W2251682575","https://openalex.org/W2251765408","https://openalex.org/W2257408573","https://openalex.org/W2270190199","https://openalex.org/W2399188371","https://openalex.org/W2471177583","https://openalex.org/W2508117065","https://openalex.org/W2523478734","https://openalex.org/W2605035112","https://openalex.org/W2760738985","https://openalex.org/W2778814079","https://openalex.org/W2895810819","https://openalex.org/W2902918014","https://openalex.org/W2903182367","https://openalex.org/W2963403868","https://openalex.org/W2963506925","https://openalex.org/W2963602293","https://openalex.org/W2963919854","https://openalex.org/W3082674894","https://openalex.org/W3166956191","https://openalex.org/W3186536529","https://openalex.org/W3208011254","https://openalex.org/W4241645538","https://openalex.org/W4294170691","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W2365169615","https://openalex.org/W1970538215","https://openalex.org/W2400151637","https://openalex.org/W2975827637","https://openalex.org/W2354089692","https://openalex.org/W595497825","https://openalex.org/W4236323843","https://openalex.org/W2002616876","https://openalex.org/W3174418441","https://openalex.org/W4309298167"],"abstract_inverted_index":{"We":[0],"present":[1],"our":[2,61,118],"semantic":[3,18,104],"textual":[4],"similarity":[5],"approach":[6,30],"in":[7,33,43,48,75,121],"filtering":[8,85],"a":[9,97,124],"noisy":[10,125],"web":[11],"crawled":[12],"parallel":[13,126],"corpus":[14,95,128],"using":[15],"YiSi-a":[16],"novel":[17],"machine":[19],"translation":[20],"evaluation":[21],"metric.":[22],"The":[23],"systems":[24,80],"mainly":[25],"based":[26],"on":[27],"this":[28,113],"supervised":[29],"perform":[31],"well":[32],"the":[34,68,90,93,109,131],"WMT18":[35],"Parallel":[36],"Corpus":[37],"Filtering":[38],"shared":[39],"task":[40],"(4th":[41],"place":[42,47,53],"100-millionword":[44],"evaluation,":[45,50],"8th":[46],"10-million-word":[49],"and":[51,96,105,137],"6th":[52],"overall,":[54],"out":[55],"of":[56,67,92,108],"48":[57],"submissions).":[58],"In":[59,112],"fact,":[60],"best":[62],"performing":[63],"system-NRC-yisi-bicov":[64],"is":[65],"one":[66],"only":[69],"four":[70],"submissions":[71],"ranked":[72],"top":[73],"10":[74],"both":[76],"evaluations.":[77],"Our":[78],"submitted":[79],"also":[81,116],"include":[82],"some":[83],"initial":[84],"steps":[86],"for":[87,102,129],"scaling":[88],"down":[89],"size":[91],"test":[94],"final":[98],"redundancy":[99],"removal":[100],"step":[101],"better":[103],"token":[106],"coverage":[107],"filtered":[110],"corpus.":[111],"paper,":[114],"we":[115],"describe":[117],"unsuccessful":[119],"attempt":[120],"automatically":[122],"synthesizing":[123],"development":[127],"tuning":[130],"weights":[132],"to":[133],"combine":[134],"different":[135],"parallelism":[136],"fluency":[138],"features.":[139]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":9},{"year":2019,"cited_by_count":9},{"year":2018,"cited_by_count":3}],"updated_date":"2026-03-08T08:50:53.379069","created_date":"2025-10-10T00:00:00"}
