{"id":"https://openalex.org/W1974995373","doi":"https://doi.org/10.1109/icde.2008.4497435","title":"Fast Indexes and Algorithms for Set Similarity Selection Queries","display_name":"Fast Indexes and Algorithms for Set Similarity Selection Queries","publication_year":2008,"publication_date":"2008-04-01","ids":{"openalex":"https://openalex.org/W1974995373","doi":"https://doi.org/10.1109/icde.2008.4497435","mag":"1974995373"},"language":"en","primary_location":{"id":"doi:10.1109/icde.2008.4497435","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icde.2008.4497435","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2008 IEEE 24th International Conference on Data Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058831095","display_name":"Marios Hadjieleftheriou","orcid":null},"institutions":[{"id":"https://openalex.org/I1283103587","display_name":"AT&T (United States)","ror":"https://ror.org/02bbd5539","country_code":"US","type":"company","lineage":["https://openalex.org/I1283103587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Marios Hadjieleftheriou","raw_affiliation_strings":["AT and T Research Laboratories, Florham Park, NJ, USA","AT&T Labs\u2014\u2014Research, Florham Park, NJ#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AT and T Research Laboratories, Florham Park, NJ, USA","institution_ids":["https://openalex.org/I1283103587"]},{"raw_affiliation_string":"AT&T Labs\u2014\u2014Research, Florham Park, NJ#TAB#","institution_ids":["https://openalex.org/I1283103587"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110823710","display_name":"Amit Chandel","orcid":null},"institutions":[{"id":"https://openalex.org/I1283103587","display_name":"AT&T (United States)","ror":"https://ror.org/02bbd5539","country_code":"US","type":"company","lineage":["https://openalex.org/I1283103587"]},{"id":"https://openalex.org/I185261750","display_name":"University of Toronto","ror":"https://ror.org/03dbr7087","country_code":"CA","type":"education","lineage":["https://openalex.org/I185261750"]}],"countries":["CA","US"],"is_corresponding":false,"raw_author_name":"Amit Chandel","raw_affiliation_strings":["Department of Computer Science, University of Toronto, Toronto, ONT, Canada","AT&T Labs\u2014\u2014Research, Florham Park, NJ#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Toronto, Toronto, ONT, Canada","institution_ids":["https://openalex.org/I185261750"]},{"raw_affiliation_string":"AT&T Labs\u2014\u2014Research, Florham Park, NJ#TAB#","institution_ids":["https://openalex.org/I1283103587"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035257754","display_name":"Nick Koudas","orcid":"https://orcid.org/0000-0001-5648-0638"},"institutions":[{"id":"https://openalex.org/I185261750","display_name":"University of Toronto","ror":"https://ror.org/03dbr7087","country_code":"CA","type":"education","lineage":["https://openalex.org/I185261750"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Nick Koudas","raw_affiliation_strings":["Department of Computer Science, University of Toronto, Toronto, ONT, Canada","[Department of Computer Science, University of Toronto, Toronto, ON M5S 2E4, Canada. koudas@cs.toronto.edu]"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Toronto, Toronto, ONT, Canada","institution_ids":["https://openalex.org/I185261750"]},{"raw_affiliation_string":"[Department of Computer Science, University of Toronto, Toronto, ON M5S 2E4, Canada. koudas@cs.toronto.edu]","institution_ids":["https://openalex.org/I185261750"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088315797","display_name":"Divesh Srivastava","orcid":"https://orcid.org/0000-0002-7609-9217"},"institutions":[{"id":"https://openalex.org/I1283103587","display_name":"AT&T (United States)","ror":"https://ror.org/02bbd5539","country_code":"US","type":"company","lineage":["https://openalex.org/I1283103587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Divesh Srivastava","raw_affiliation_strings":["AT and T Research Laboratories, Florham Park, NJ, USA","AT&TLabs-Research, Florham Park, NJ 07932 USA. divesh@research.att.com"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AT and T Research Laboratories, Florham Park, NJ, USA","institution_ids":["https://openalex.org/I1283103587"]},{"raw_affiliation_string":"AT&TLabs-Research, Florham Park, NJ 07932 USA. divesh@research.att.com","institution_ids":["https://openalex.org/I1283103587"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":20.0862,"has_fulltext":false,"cited_by_count":101,"citation_normalized_percentile":{"value":0.99512804,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"267","last_page":"276"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7784785032272339},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.7101793885231018},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6466502547264099},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6094576716423035},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5623576641082764},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.5523847937583923},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5464001893997192},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.48965007066726685},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.45359936356544495},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.4142720699310303},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.41406887769699097},{"id":"https://openalex.org/keywords/nearest-neighbor-search","display_name":"Nearest neighbor search","score":0.41360968351364136},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3331880569458008},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2450695037841797},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.14631855487823486}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7784785032272339},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.7101793885231018},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6466502547264099},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6094576716423035},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5623576641082764},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.5523847937583923},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5464001893997192},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.48965007066726685},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.45359936356544495},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.4142720699310303},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.41406887769699097},{"id":"https://openalex.org/C116738811","wikidata":"https://www.wikidata.org/wiki/Q608751","display_name":"Nearest neighbor search","level":2,"score":0.41360968351364136},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3331880569458008},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2450695037841797},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.14631855487823486},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icde.2008.4497435","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icde.2008.4497435","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2008 IEEE 24th International Conference on Data Engineering","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.110.3089","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.110.3089","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.research.att.com/~marioh/papers/icde08-1.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.4399999976158142,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W1660390307","https://openalex.org/W2001496424","https://openalex.org/W2024605621","https://openalex.org/W2045821558","https://openalex.org/W2067566391","https://openalex.org/W2096598900","https://openalex.org/W2097776316","https://openalex.org/W2103014446","https://openalex.org/W2116544254","https://openalex.org/W2121516976","https://openalex.org/W2127675794","https://openalex.org/W2134206624","https://openalex.org/W2141469207","https://openalex.org/W2161936973","https://openalex.org/W2162102353","https://openalex.org/W2168459716","https://openalex.org/W2169844574","https://openalex.org/W2998852864","https://openalex.org/W6674576723","https://openalex.org/W6675441415","https://openalex.org/W6677177135","https://openalex.org/W6683401941"],"related_works":["https://openalex.org/W2109424811","https://openalex.org/W2375480909","https://openalex.org/W2353314428","https://openalex.org/W2381195555","https://openalex.org/W4246757943","https://openalex.org/W2012019886","https://openalex.org/W2114797768","https://openalex.org/W2380654781","https://openalex.org/W2176214140","https://openalex.org/W2516873349"],"abstract_inverted_index":{"Data":[0],"collections":[1],"often":[2],"have":[3,69,108],"inconsistencies":[4],"that":[5,93,112,152],"arise":[6],"due":[7],"to":[8,17,20,116,134,162],"a":[9,50,57,102],"variety":[10],"of":[11,131,157],"reasons,":[12],"and":[13,22,90,122,169],"it":[14],"is":[15],"desirable":[16],"be":[18,114],"able":[19],"identify":[21],"resolve":[23],"them":[24],"efficiently.":[25,127],"Set":[26],"similarity":[27,46,60,67,86,99,138,160],"queries":[28,126],"are":[29,94],"commonly":[30],"used":[31],"in":[32,56,72,101,167],"data":[33,76],"cleaning":[34,77],"for":[35,75,97,124,136],"matching":[36],"similar":[37],"data.":[38],"In":[39,79],"this":[40,80],"work":[41,81,135],"we":[42,82],"concentrate":[43,83],"on":[44,84,148],"set":[45,66,98,137],"selection":[47,139],"queries:":[48],"Given":[49],"query":[51],"set,":[52],"retrieve":[53],"all":[54],"sets":[55],"collection":[58],"with":[59],"greater":[61],"than":[62],"some":[63],"threshold.":[64],"Various":[65],"measures":[68,161],"been":[70],"proposed":[71],"the":[73,149,154,158,164],"past":[74],"purposes.":[78],"weighted":[85],"functions":[87],"like":[88],"TF/IDF,":[89],"introduce":[91,143],"variants":[92,107],"well":[95],"suited":[96],"selections":[100],"relational":[103],"database":[104],"context.":[105],"These":[106],"special":[109],"semantic":[110,155],"properties":[111,156],"can":[113],"exploited":[115],"design":[117],"very":[118],"efficient":[119],"index":[120],"structures":[121],"algorithms":[123,146],"answering":[125],"We":[128,141],"present":[129],"modifications":[130],"existing":[132],"technologies":[133],"queries.":[140],"also":[142],"three":[144],"novel":[145],"based":[147],"Threshold":[150],"Algorithm,":[151],"exploit":[153],"new":[159],"achieve":[163],"best":[165],"performance":[166],"theory":[168],"practice.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":3},{"year":2017,"cited_by_count":4},{"year":2016,"cited_by_count":2},{"year":2015,"cited_by_count":12},{"year":2014,"cited_by_count":11},{"year":2013,"cited_by_count":7},{"year":2012,"cited_by_count":8}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
