{"id":"https://openalex.org/W2120908895","doi":"https://doi.org/10.1145/2247596.2247642","title":"<i>CRSI</i>","display_name":"<i>CRSI</i>","publication_year":2012,"publication_date":"2012-03-27","ids":{"openalex":"https://openalex.org/W2120908895","doi":"https://doi.org/10.1145/2247596.2247642","mag":"2120908895"},"language":"en","primary_location":{"id":"doi:10.1145/2247596.2247642","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2247596.2247642","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 15th International Conference on Extending Database Technology","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067988262","display_name":"Petros Venetis","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Petros Venetis","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048704801","display_name":"Yannis Sismanis","orcid":null},"institutions":[{"id":"https://openalex.org/I4210085935","display_name":"IBM Research - Almaden","ror":"https://ror.org/005w8dd04","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210085935","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yannis Sismanis","raw_affiliation_strings":["IBM Research -- Almaden"],"affiliations":[{"raw_affiliation_string":"IBM Research -- Almaden","institution_ids":["https://openalex.org/I4210085935"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036119025","display_name":"Berthold Reinwald","orcid":null},"institutions":[{"id":"https://openalex.org/I4210085935","display_name":"IBM Research - Almaden","ror":"https://ror.org/005w8dd04","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210085935","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Berthold Reinwald","raw_affiliation_strings":["IBM Research -- Almaden"],"affiliations":[{"raw_affiliation_string":"IBM Research -- Almaden","institution_ids":["https://openalex.org/I4210085935"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5067988262"],"corresponding_institution_ids":["https://openalex.org/I97018004"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.18205194,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"384","last_page":"395"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7930809259414673},{"id":"https://openalex.org/keywords/jaccard-index","display_name":"Jaccard index","score":0.7767307758331299},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.6754997968673706},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6243143081665039},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.6084400415420532},{"id":"https://openalex.org/keywords/multiset","display_name":"Multiset","score":0.5121176242828369},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5099622011184692},{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.49887561798095703},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.44741290807724},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.4405848979949951},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4292064607143402},{"id":"https://openalex.org/keywords/inverted-index","display_name":"Inverted index","score":0.42483022809028625},{"id":"https://openalex.org/keywords/a-priori-and-a-posteriori","display_name":"A priori and a posteriori","score":0.42383524775505066},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3688540458679199},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.2701890170574188},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.17100238800048828},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.13825348019599915},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.13066574931144714},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1252826750278473}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7930809259414673},{"id":"https://openalex.org/C203519979","wikidata":"https://www.wikidata.org/wiki/Q865360","display_name":"Jaccard index","level":3,"score":0.7767307758331299},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.6754997968673706},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6243143081665039},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6084400415420532},{"id":"https://openalex.org/C2779623528","wikidata":"https://www.wikidata.org/wiki/Q864377","display_name":"Multiset","level":2,"score":0.5121176242828369},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5099622011184692},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.49887561798095703},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.44741290807724},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4405848979949951},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4292064607143402},{"id":"https://openalex.org/C130590232","wikidata":"https://www.wikidata.org/wiki/Q1671754","display_name":"Inverted index","level":3,"score":0.42483022809028625},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.42383524775505066},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3688540458679199},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2701890170574188},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.17100238800048828},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.13825348019599915},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.13066574931144714},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1252826750278473},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2247596.2247642","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2247596.2247642","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 15th International Conference on Extending Database Technology","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.4099999964237213,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1502916507","https://openalex.org/W1660390307","https://openalex.org/W1785933978","https://openalex.org/W1979819093","https://openalex.org/W1991800036","https://openalex.org/W1992363839","https://openalex.org/W2012833704","https://openalex.org/W2040546864","https://openalex.org/W2085922539","https://openalex.org/W2096598900","https://openalex.org/W2097184821","https://openalex.org/W2097332592","https://openalex.org/W2097776316","https://openalex.org/W2105436061","https://openalex.org/W2109803107","https://openalex.org/W2114353347","https://openalex.org/W2115215982","https://openalex.org/W2121516976","https://openalex.org/W2127675794","https://openalex.org/W2142385580","https://openalex.org/W2147717514","https://openalex.org/W2152565070","https://openalex.org/W2154610494","https://openalex.org/W2161443453","https://openalex.org/W2295428206","https://openalex.org/W2397770138","https://openalex.org/W4247849388"],"related_works":["https://openalex.org/W2321155245","https://openalex.org/W2058987221","https://openalex.org/W2145657320","https://openalex.org/W2128056631","https://openalex.org/W2102493899","https://openalex.org/W1576730743","https://openalex.org/W2388346754","https://openalex.org/W4282568653","https://openalex.org/W2399928914","https://openalex.org/W2123678380"],"abstract_inverted_index":{"We":[0,64,147],"propose":[1],"a":[2,60,144],"similarity":[3,15,48,56,62,70,77],"index":[4,71,88,112],"for":[5,11,23,54,84],"set-valued":[6],"features":[7],"and":[8,30,49,97,123,131,165],"study":[9],"algorithms":[10,99,133,154,164],"executing":[12],"various":[13],"set":[14],"queries":[16,20],"on":[17,46,59,80],"it.":[18],"Such":[19],"are":[21],"fundamental":[22],"many":[24],"application":[25],"areas,":[26],"including":[27],"data":[28,32],"integration":[29],"cleaning,":[31],"profiling":[33],"as":[34,36],"well":[35],"near":[37],"duplicate":[38],"document":[39],"detection.":[40],"In":[41],"this":[42,69],"paper,":[43],"we":[44,158],"focus":[45],"Jaccard":[47],"present":[50],"estimators":[51],"that":[52,140],"work":[53],"arbitrary":[55],"thresholds":[57],"based":[58,79],"single":[61],"index.":[63],"show":[65],"how":[66],"to":[67,105,125],"build":[68],"a-priori,":[72],"without":[73],"knowledge":[74],"about":[75],"query":[76,109],"thresholds,":[78],"recently":[81],"proposed":[82],"synopses":[83],"multiset":[85],"operations.":[86],"The":[87,111],"is":[89,118,141],"deployed":[90],"using":[91],"existing":[92],"disk-based":[93],"inverted":[94],"indexing":[95],"implementations":[96],"our":[98,153],"exploit":[100],"available":[101],"techniques,":[102],"like":[103],"skip-lists,":[104],"further":[106],"optimize":[107],"the":[108,132,149,160,163],"performance.":[110],"has":[113],"provably":[114],"small":[115],"space":[116],"footprints,":[117],"orders":[119],"of":[120,152,162],"magnitude":[121],"smaller":[122],"faster":[124],"create/incrementally":[126],"maintain":[127],"than":[128],"exact":[129],"solutions,":[130],"provide":[134],"approximate":[135],"answers,":[136],"with":[137],"an":[138],"error":[139,150],"controlled":[142],"by":[143],"user-specified":[145],"parameter.":[146],"prove":[148],"bounds":[151],"analytically,":[155],"and,":[156],"finally,":[157],"demonstrate":[159],"performance":[161],"verify":[166],"their":[167],"accuracy":[168],"experimentally.":[169]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2021,"cited_by_count":2},{"year":2019,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2016-06-24T00:00:00"}
