{"id":"https://openalex.org/W2511852057","doi":"https://doi.org/10.18653/v1/w16-2365","title":"Quick and Reliable Document Alignment via TF/IDF-weighted Cosine Distance","display_name":"Quick and Reliable Document Alignment via TF/IDF-weighted Cosine Distance","publication_year":2016,"publication_date":"2016-01-01","ids":{"openalex":"https://openalex.org/W2511852057","doi":"https://doi.org/10.18653/v1/w16-2365","mag":"2511852057"},"language":"en","primary_location":{"id":"doi:10.18653/v1/w16-2365","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w16-2365","pdf_url":"https://www.aclweb.org/anthology/W16-2365.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the First Conference on Machine Translation: Volume 2,\n          Shared Task Papers","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.aclweb.org/anthology/W16-2365.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005095034","display_name":"Christian Buck","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Christian Buck","raw_affiliation_strings":["University of Edinburgh Edinburgh, Scotland"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh Edinburgh, Scotland","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112315093","display_name":"Philipp Koehn","orcid":null},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Philipp Koehn","raw_affiliation_strings":["Center for Language and Speech Processing Department of Computer Science Johns Hopkins University, Baltimore, MD"],"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing Department of Computer Science Johns Hopkins University, Baltimore, MD","institution_ids":["https://openalex.org/I145311948"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5005095034"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":4.2847,"has_fulltext":false,"cited_by_count":31,"citation_normalized_percentile":{"value":0.94902131,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"672","last_page":"678"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7558468580245972},{"id":"https://openalex.org/keywords/bipartite-graph","display_name":"Bipartite graph","score":0.7451773285865784},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.7261664271354675},{"id":"https://openalex.org/keywords/tf\u2013idf","display_name":"tf\u2013idf","score":0.5842382907867432},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5509209632873535},{"id":"https://openalex.org/keywords/trigonometric-functions","display_name":"Trigonometric functions","score":0.48660361766815186},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4651898443698883},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.42725250124931335},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4212248921394348},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.39099597930908203},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3863089084625244},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.37787219882011414},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.37176787853240967},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3469444215297699},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3266165852546692},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.2601625323295593},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1406831443309784}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7558468580245972},{"id":"https://openalex.org/C197657726","wikidata":"https://www.wikidata.org/wiki/Q174733","display_name":"Bipartite graph","level":3,"score":0.7451773285865784},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.7261664271354675},{"id":"https://openalex.org/C81758059","wikidata":"https://www.wikidata.org/wiki/Q796584","display_name":"tf\u2013idf","level":3,"score":0.5842382907867432},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5509209632873535},{"id":"https://openalex.org/C178009071","wikidata":"https://www.wikidata.org/wiki/Q93344","display_name":"Trigonometric functions","level":2,"score":0.48660361766815186},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4651898443698883},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.42725250124931335},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4212248921394348},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.39099597930908203},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3863089084625244},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.37787219882011414},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37176787853240967},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3469444215297699},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3266165852546692},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2601625323295593},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1406831443309784},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/w16-2365","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w16-2365","pdf_url":"https://www.aclweb.org/anthology/W16-2365.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the First Conference on Machine Translation: Volume 2,\n          Shared Task Papers","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.ed.ac.uk:publications/36527572-3b88-4049-9358-aea55248ef74","is_oa":true,"landing_page_url":"http://hdl.handle.net/20.500.11820/36527572-3b88-4049-9358-aea55248ef74","pdf_url":null,"source":{"id":"https://openalex.org/S4306400321","display_name":"Edinburgh Research Explorer (University of Edinburgh)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98677209","host_organization_name":"University of Edinburgh","host_organization_lineage":["https://openalex.org/I98677209"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":{"id":"doi:10.18653/v1/w16-2365","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w16-2365","pdf_url":"https://www.aclweb.org/anthology/W16-2365.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the First Conference on Machine Translation: Volume 2,\n          Shared Task Papers","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2511852057.pdf","grobid_xml":"https://content.openalex.org/works/W2511852057.grobid-xml"},"referenced_works_count":9,"referenced_works":["https://openalex.org/W1532325895","https://openalex.org/W1981053739","https://openalex.org/W2047295649","https://openalex.org/W2101096097","https://openalex.org/W2141461755","https://openalex.org/W2145080939","https://openalex.org/W2145251161","https://openalex.org/W2170716095","https://openalex.org/W2406591195"],"related_works":["https://openalex.org/W4390349266","https://openalex.org/W4391020859","https://openalex.org/W126212742","https://openalex.org/W3170122200","https://openalex.org/W3008856892","https://openalex.org/W1948687848","https://openalex.org/W2953417386","https://openalex.org/W2595951309","https://openalex.org/W3109760095","https://openalex.org/W2473504045"],"abstract_inverted_index":{"This":[0],"work":[1],"describes":[2],"our":[3],"submission":[4],"to":[5,33,62],"the":[6,42,59],"WMT16":[7],"Bilingual":[8],"Document":[9],"Alignment":[10],"task.We":[11],"show":[12],"that":[13,50],"a":[14,28,47],"very":[15],"simple":[16],"distance":[17,21],"metric,":[18],"namely":[19],"Cosine":[20],"of":[22],"tf/idf":[23],"weighted":[24],"document":[25,43],"vectors":[26],"provides":[27],"quick":[29],"and":[30,53,70],"reliable":[31],"way":[32],"align":[34],"documents.We":[35],"compare":[36],"many":[37],"possible":[38],"variants":[39],"for":[40],"constructing":[41],"vectors.We":[44],"also":[45],"introduce":[46],"greedy":[48],"algorithm":[49],"runs":[51],"quicker":[52],"performs":[54],"better":[55],"in":[56],"practice":[57],"than":[58],"optimal":[60],"solution":[61],"bipartite":[63],"graph":[64],"matching.Our":[65],"approach":[66],"shows":[67],"competitive":[68],"performance":[69],"can":[71],"be":[72],"improved":[73],"even":[74],"further":[75],"through":[76],"combination":[77],"with":[78],"URL":[79],"based":[80],"pair":[81],"matching.":[82]},"counts_by_year":[{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":9},{"year":2019,"cited_by_count":7},{"year":2018,"cited_by_count":2},{"year":2016,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
