{"id":"https://openalex.org/W2795419443","doi":"https://doi.org/10.1145/3196959.3196985","title":"Set Similarity Search for Skewed Data","display_name":"Set Similarity Search for Skewed Data","publication_year":2018,"publication_date":"2018-05-15","ids":{"openalex":"https://openalex.org/W2795419443","doi":"https://doi.org/10.1145/3196959.3196985","mag":"2795419443"},"language":"en","primary_location":{"id":"doi:10.1145/3196959.3196985","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3196959.3196985","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://pure.itu.dk/portal/da/publications/33c0c7ce-1045-4885-90c9-cc77e5528b53","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006653893","display_name":"Samuel McCauley","orcid":"https://orcid.org/0000-0001-8196-9662"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samuel McCauley","raw_affiliation_strings":["BARC and IT U. Copenhagen, Copenhagen, Denmark"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BARC and IT U. Copenhagen, Copenhagen, Denmark","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027425067","display_name":"Jesper W. Mikkelsen","orcid":"https://orcid.org/0000-0002-6905-5953"},"institutions":[{"id":"https://openalex.org/I83467386","display_name":"IT University of Copenhagen","ror":"https://ror.org/02309jg23","country_code":"DK","type":"education","lineage":["https://openalex.org/I83467386"]}],"countries":["DK"],"is_corresponding":false,"raw_author_name":"Jesper W. Mikkelsen","raw_affiliation_strings":["IT U. Copenhagen, Copenhagen, Denmark"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IT U. Copenhagen, Copenhagen, Denmark","institution_ids":["https://openalex.org/I83467386"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014293815","display_name":"Rasmus Pagh","orcid":"https://orcid.org/0000-0002-1516-9306"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rasmus Pagh","raw_affiliation_strings":["BARC and IT U. Copenhagen, Copenhagen, Denmark"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BARC and IT U. Copenhagen, Copenhagen, Denmark","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02710565,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"63","last_page":"74"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11303","display_name":"Bayesian Modeling and Causal Inference","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/nearest-neighbor-search","display_name":"Nearest neighbor search","score":0.6971504092216492},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6936987638473511},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.6460697650909424},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5912651419639587},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5439288020133972},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.5350794196128845},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.48865583539009094},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.4157962501049042},{"id":"https://openalex.org/keywords/intersection","display_name":"Intersection (aeronautics)","score":0.4118891954421997},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3721546530723572},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3258165121078491},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.25567108392715454},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.09095785021781921}],"concepts":[{"id":"https://openalex.org/C116738811","wikidata":"https://www.wikidata.org/wiki/Q608751","display_name":"Nearest neighbor search","level":2,"score":0.6971504092216492},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6936987638473511},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.6460697650909424},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5912651419639587},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5439288020133972},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.5350794196128845},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.48865583539009094},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.4157962501049042},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.4118891954421997},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3721546530723572},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3258165121078491},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25567108392715454},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.09095785021781921},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C146978453","wikidata":"https://www.wikidata.org/wiki/Q3798668","display_name":"Aerospace engineering","level":1,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1145/3196959.3196985","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3196959.3196985","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.atira.dk:openaire/33c0c7ce-1045-4885-90c9-cc77e5528b53","is_oa":true,"landing_page_url":"https://pure.itu.dk/portal/da/publications/33c0c7ce-1045-4885-90c9-cc77e5528b53","pdf_url":null,"source":{"id":"https://openalex.org/S4377196680","display_name":"IT University Of Copenhagen (IT University of Copenhagen)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I83467386","host_organization_name":"IT University of Copenhagen","host_organization_lineage":["https://openalex.org/I83467386"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"McCauley, S, Mikkelsen, J W & Pagh, R 2018, Set Similarity Search for Skewed Data. in Proceedings of Principles of Database Systems (PODS). Association for Computing Machinery, pp. 63-74. https://doi.org/10.1145/3196959.3196985","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:arXiv.org:1804.03054","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1804.03054","pdf_url":"https://arxiv.org/pdf/1804.03054","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:2795419443","is_oa":true,"landing_page_url":"http://export.arxiv.org/pdf/1804.03054","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1804.03054","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1804.03054","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:oai:pure.atira.dk:openaire/33c0c7ce-1045-4885-90c9-cc77e5528b53","is_oa":true,"landing_page_url":"https://pure.itu.dk/portal/da/publications/33c0c7ce-1045-4885-90c9-cc77e5528b53","pdf_url":null,"source":{"id":"https://openalex.org/S4377196680","display_name":"IT University Of Copenhagen (IT University of Copenhagen)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I83467386","host_organization_name":"IT University of Copenhagen","host_organization_lineage":["https://openalex.org/I83467386"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"McCauley, S, Mikkelsen, J W & Pagh, R 2018, Set Similarity Search for Skewed Data. in Proceedings of Principles of Database Systems (PODS). Association for Computing Machinery, pp. 63-74. https://doi.org/10.1145/3196959.3196985","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3829048491","display_name":null,"funder_award_id":"FP7/2007-2013","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G4422840520","display_name":null,"funder_award_id":"16582","funder_id":"https://openalex.org/F4320310490","funder_display_name":"Villum Fonden"},{"id":"https://openalex.org/G5593277320","display_name":null,"funder_award_id":"2007-2013","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G5702163051","display_name":null,"funder_award_id":"FP7/2007","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G6396797479","display_name":null,"funder_award_id":"614331","funder_id":"https://openalex.org/F4320338335","funder_display_name":"H2020 European Research Council"},{"id":"https://openalex.org/G7811487381","display_name":"Scalable Similarity Search","funder_award_id":"614331","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320310490","display_name":"Villum Fonden","ror":"https://ror.org/007ww2d15"},{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320334966","display_name":"Bhabha Atomic Research Centre","ror":"https://ror.org/05w6wfp17"},{"id":"https://openalex.org/F4320338335","display_name":"H2020 European Research Council","ror":"https://ror.org/0472cxd90"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W566315627","https://openalex.org/W1430582609","https://openalex.org/W1455310343","https://openalex.org/W1581095529","https://openalex.org/W1595409123","https://openalex.org/W1599364940","https://openalex.org/W1647207848","https://openalex.org/W1973001156","https://openalex.org/W1977046819","https://openalex.org/W1989664320","https://openalex.org/W1995725694","https://openalex.org/W1998067572","https://openalex.org/W2010416066","https://openalex.org/W2012833704","https://openalex.org/W2017851434","https://openalex.org/W2080844740","https://openalex.org/W2096598900","https://openalex.org/W2097184821","https://openalex.org/W2097776316","https://openalex.org/W2098900423","https://openalex.org/W2115854352","https://openalex.org/W2121269638","https://openalex.org/W2126754439","https://openalex.org/W2127859122","https://openalex.org/W2132069633","https://openalex.org/W2152565070","https://openalex.org/W2169054943","https://openalex.org/W2218762779","https://openalex.org/W2239648585","https://openalex.org/W2263882035","https://openalex.org/W2268968630","https://openalex.org/W2295744963","https://openalex.org/W2308071406","https://openalex.org/W2396588571","https://openalex.org/W2507428467","https://openalex.org/W2568140450","https://openalex.org/W2574633002","https://openalex.org/W2585199909","https://openalex.org/W2604829000","https://openalex.org/W2612210001","https://openalex.org/W2646549016","https://openalex.org/W2735058673","https://openalex.org/W2949388608","https://openalex.org/W2951683349","https://openalex.org/W2952088295","https://openalex.org/W2952486042","https://openalex.org/W2953209208","https://openalex.org/W2963046172","https://openalex.org/W2963703787","https://openalex.org/W2963886823","https://openalex.org/W2964013013","https://openalex.org/W2964142086","https://openalex.org/W3093544618","https://openalex.org/W3098556943","https://openalex.org/W3105727767","https://openalex.org/W6634770223","https://openalex.org/W6679663036","https://openalex.org/W6698240980"],"related_works":["https://openalex.org/W2963535486","https://openalex.org/W2963886823","https://openalex.org/W129280620","https://openalex.org/W2903672378","https://openalex.org/W3012776781","https://openalex.org/W2001670934","https://openalex.org/W2592901506","https://openalex.org/W3127292082","https://openalex.org/W2803960581","https://openalex.org/W2611719484","https://openalex.org/W2044163187","https://openalex.org/W2619410666","https://openalex.org/W2560303789","https://openalex.org/W2963549845","https://openalex.org/W2952967169","https://openalex.org/W2171129114","https://openalex.org/W2798412430","https://openalex.org/W2950817225","https://openalex.org/W2039612080","https://openalex.org/W3003103334"],"abstract_inverted_index":{"Set":[0],"similarity":[1,11,67,106,160,177,214,261],"join,":[2],"as":[3,5,48,55,88],"well":[4],"the":[6,37,56,112,153,190,200,212,223,288],"corresponding":[7],"indexing":[8,113,245],"problem":[9,114,215],"set":[10,57,66,121,176,213,260],"search,":[12],"are":[13,179],"fundamental":[14],"primitives":[15,25],"for":[16,143,175],"managing":[17],"noisy":[18],"or":[19,53],"uncertain":[20],"data.":[21],"For":[22],"example,":[23],"these":[24],"can":[26,44,69],"be":[27,71],"used":[28,72],"in":[29,61,119,137,148,166,189,230,241,259,275],"data":[30,150,191,219,227,246,291],"cleaning":[31],"to":[32,73,92,104,141,182,210,269,272],"identify":[33,74],"different":[34],"representations":[35],"of":[36,58,115,123,155,159,187,197,206,225,253,290],"same":[38],"object.":[39],"In":[40,108],"many":[41],"cases":[42],"one":[43],"represent":[45],"an":[46,79,144],"object":[47],"a":[49,63,120,129,134,149,217,249],"sparse":[50],"0-1":[51],"vector,":[52],"equivalently":[54],"nonzero":[59],"entries":[60],"such":[62],"vector.":[64],"A":[65],"join":[68],"then":[70],"those":[75],"pairs":[76],"that":[77,184,221,281],"have":[78],"exceptionally":[80],"large":[81,98],"dot":[82],"product":[83],"(or":[84],"intersection,":[85],"when":[86],"viewed":[87],"sets).":[89],"We":[90],"choose":[91],"focus":[93],"on":[94,287],"identifying":[95,116],"vectors":[96,118,124,154,254],"with":[97],"Pearson":[99],"correlation,":[100],"but":[101],"results":[102,234],"extend":[103],"other":[105],"measures.":[107],"particular,":[109],"we":[110,139,279],"consider":[111],"correlated":[117],"S":[122],"sampled":[125],"from":[126],"0,1d.":[127],"Given":[128],"query":[130],"vector":[131,146],"y":[132],"and":[133],"parameter":[135],"alpha":[136],"(0,1),":[138],"need":[140],"search":[142,161,178],"alpha-correlated":[145],"x":[147],"structure":[151,247],"representing":[152],"S.":[156],"This":[157],"kind":[158,224],"has":[162],"been":[163],"intensely":[164],"studied":[165],"worst-case":[167,242],"(non-random":[168],"data)":[169],"settings.":[170,243],"Existing":[171],"theoretically":[172],"well-founded":[173],"methods":[174,265],"often":[180],"inferior":[181],"heuristics":[183],"take":[185],"advantage":[186],"skew":[188,274],"distribution,":[192],"i.e.,":[193],"widely":[194],"differing":[195],"frequencies":[196],"1s":[198],"across":[199],"d":[201],"dimensions.":[202],"The":[203],"main":[204],"contribution":[205],"this":[207],"paper":[208],"is":[209,239,248],"analyze":[211],"under":[216],"random":[218],"model":[220],"reflects":[222],"skewed":[226],"distributions":[228],"seen":[229],"practice,":[231],"allowing":[232],"theoretical":[233],"much":[235],"stronger":[236],"than":[237],"what":[238],"possible":[240],"Our":[244],"recursive,":[250],"data-dependent":[251,264],"partitioning":[252],"inspired":[255],"by":[256],"recent":[257],"advances":[258],"search.":[262],"Previous":[263],"do":[266],"not":[267],"seem":[268],"allow":[270],"us":[271],"exploit":[273],"item":[276],"frequencies,":[277],"so":[278],"believe":[280],"our":[282],"work":[283],"sheds":[284],"further":[285],"light":[286],"power":[289],"dependence.":[292]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2025-10-10T00:00:00"}
