{"id":"https://openalex.org/W6922143542","doi":"https://doi.org/10.1184/r1/21626660.v1","title":"Efficient and Accurate Non-Metric k-NN Search with Applications to Text Matching","display_name":"Efficient and Accurate Non-Metric k-NN Search with Applications to Text Matching","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W6922143542","doi":"https://doi.org/10.1184/r1/21626660.v1"},"language":"en","primary_location":{"id":"pmh:oai:figshare.com:article/21626660","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Boytsov, Leonid","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Boytsov, Leonid","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.46138743,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T13838","display_name":"Plant and soil sciences","score":0.16200000047683716,"subfield":{"id":"https://openalex.org/subfields/1100","display_name":"General Agricultural and Biological Sciences"},"field":{"id":"https://openalex.org/fields/11","display_name":"Agricultural and Biological Sciences"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T13838","display_name":"Plant and soil sciences","score":0.16200000047683716,"subfield":{"id":"https://openalex.org/subfields/1100","display_name":"General Agricultural and Biological Sciences"},"field":{"id":"https://openalex.org/fields/11","display_name":"Agricultural and Biological Sciences"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T14070","display_name":"Knowledge Societies in the 21st Century","score":0.029100000858306885,"subfield":{"id":"https://openalex.org/subfields/3317","display_name":"Demography"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T14398","display_name":"Agriculture and Social Issues","score":0.0284000001847744,"subfield":{"id":"https://openalex.org/subfields/1110","display_name":"Plant Science"},"field":{"id":"https://openalex.org/fields/11","display_name":"Agricultural and Biological Sciences"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.6166999936103821},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.6021000146865845},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5031999945640564},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4814000129699707},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.4650999903678894},{"id":"https://openalex.org/keywords/nearest-neighbor-search","display_name":"Nearest neighbor search","score":0.42149999737739563},{"id":"https://openalex.org/keywords/learning-to-rank","display_name":"Learning to rank","score":0.3955000042915344},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.39430001378059387}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7128000259399414},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.6166999936103821},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.6021000146865845},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5031999945640564},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4814000129699707},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.4650999903678894},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.43059998750686646},{"id":"https://openalex.org/C116738811","wikidata":"https://www.wikidata.org/wiki/Q608751","display_name":"Nearest neighbor search","level":2,"score":0.42149999737739563},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.3955000042915344},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3952000141143799},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.39430001378059387},{"id":"https://openalex.org/C19889080","wikidata":"https://www.wikidata.org/wiki/Q2835852","display_name":"Beam search","level":3,"score":0.3684000074863434},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35510000586509705},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3465999960899353},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.34380000829696655},{"id":"https://openalex.org/C125583679","wikidata":"https://www.wikidata.org/wiki/Q755673","display_name":"Search algorithm","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.32249999046325684},{"id":"https://openalex.org/C97854310","wikidata":"https://www.wikidata.org/wiki/Q19541","display_name":"Search engine","level":2,"score":0.3084999918937683},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2890999913215637},{"id":"https://openalex.org/C20228898","wikidata":"https://www.wikidata.org/wiki/Q83540","display_name":"Full text search","level":3,"score":0.2720000147819519},{"id":"https://openalex.org/C201789804","wikidata":"https://www.wikidata.org/wiki/Q2362762","display_name":"Search problem","level":2,"score":0.25699999928474426},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25220000743865967},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:figshare.com:article/21626660","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"},{"id":"doi:10.1184/r1/21626660.v1","is_oa":true,"landing_page_url":"https://doi.org/10.1184/r1/21626660.v1","pdf_url":null,"source":{"id":"https://openalex.org/S7407050927","display_name":"KiltHub Repository","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:figshare.com:article/21626660","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0,125],"this":[1,144,172,262],"thesis":[2,247],"we":[3,128,146],"advance":[4],"state-of-the-art":[5],"of":[6,21,34,66,193,253],"the":[7,100,105,109,191,218,226],"non-metric":[8,94],"k-NN":[9,25,53,140,169,211],"search":[10,54,87,141,170,183,212],"by":[11],"carrying":[12],"out":[13],"an":[14,181,209],"extensive":[15],"empirical":[16],"evaluation":[17],"(both":[18],"and":[19,48,58,95,256],"intrinsic)":[20],"generic":[22,41,139],"methods":[23,88,240],"for":[24,37,62],"search.":[26],"This":[27],"work":[28,91],"contributes":[29],"to":[30,237,261],"establishing":[31],"a":[32,56,63,120,130,138,148,162,195,205,251,270],"collection":[33],"strong":[35],"benchmarks":[36],"data":[38,203],"sets":[39],"with":[40,45,93,116,137,250],"distances.":[42,68,97],"We":[43,178,230,264],"start":[44],"intrinsic":[46],"evaluations":[47,113,118],"demonstrate":[49],"that":[50,151,180,233],"non":[51],"metric":[52,81],"is":[55,175,213,235,248],"practical":[57],"reasonably":[59],"accurate":[60,224],"tool":[61],"wide":[64],"variety":[65],"complex":[67],"However,":[69],"somewhat":[70],"surprisingly,":[71],"achieving":[72],"good":[73],"performance":[74],"does":[75],"not":[76],"require":[77],"distance":[78,84,106],"mapping/proxying":[79],"via":[80],"learning":[82],"or":[83],"symmetrization.":[85],"Existing":[86],"can":[89,185],"often":[90],"directly":[92],"non-symmetric":[96],"They":[98],"outperform":[99],"filter-and-refine":[101],"approach":[102],"relying":[103],"on":[104],"symmetrization":[107],"in":[108,119,198],"filtering":[110],"step.":[111],"Intrinsic":[112],"are":[114,159],"complemented":[115],"extrinsic":[117],"realistic":[121],"text":[122],"retrieval":[123,136,206,271],"task.":[124],"doing":[126],"so,":[127],"make":[129],"step":[131],"towards":[132],"replacing/complementing":[133],"classic":[134],"term-based":[135],"algorithm.":[142],"To":[143],"end":[145],"use":[147],"similarity":[149,173],"function":[150,174],"takes":[152],"into":[153],"account":[154],"subtle":[155],"term":[156],"associations,":[157],"which":[158],"learned":[160,254],"from":[161],"parallel":[163],"monolingual":[164],"corpus.":[165],"An":[166],"exact":[167],"brute-force":[168],"using":[171,208],"quite":[176],"slow.":[177],"show":[179],"approximate":[182,210],"algorithm":[184],"be":[186],"100-300":[187],"times":[188],"faster":[189],"at":[190],"expense":[192],"only":[194],"small":[196],"loss":[197],"accuracy":[199],"(10%).":[200],"On":[201],"one":[202],"set,":[204],"pipeline":[207],"twice":[214],"as":[215,217,223,225],"efficient":[216],"C++":[219],"baseline":[220],"while":[221],"being":[222],"Lucene-based":[227],"fusion":[228],"pipeline.":[229],"note,":[231],"however,":[232],"it":[234],"necessary":[236],"compare":[238],"our":[239],"against":[241],"more":[242],"recent":[243],"ranking":[244],"algorithms.":[245],"The":[246],"concluded":[249],"summary":[252],"lessons":[255],"open":[257],"research":[258],"questions":[259],"(relevant":[260],"work).":[263],"also":[265],"discuss":[266],"potential":[267],"challenges":[268],"facing":[269],"system":[272],"designer.":[273]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
