{"id":"https://openalex.org/W2883952940","doi":"https://doi.org/10.14778/3231751.3231760","title":"Set similarity joins on mapreduce","display_name":"Set similarity joins on mapreduce","publication_year":2018,"publication_date":"2018-06-01","ids":{"openalex":"https://openalex.org/W2883952940","doi":"https://doi.org/10.14778/3231751.3231760","mag":"2883952940"},"language":"en","primary_location":{"id":"doi:10.14778/3231751.3231760","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3231751.3231760","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028799926","display_name":"Fabian Fier","orcid":null},"institutions":[{"id":"https://openalex.org/I39343248","display_name":"Humboldt-Universit\u00e4t zu Berlin","ror":"https://ror.org/01hcx6992","country_code":"DE","type":"education","lineage":["https://openalex.org/I39343248"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Fabian Fier","raw_affiliation_strings":["Humboldt-Universit\u00e4t zu Berlin, Berlin, Germany"],"affiliations":[{"raw_affiliation_string":"Humboldt-Universit\u00e4t zu Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I39343248"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089328860","display_name":"Nikolaus Augsten","orcid":"https://orcid.org/0000-0002-3036-6201"},"institutions":[{"id":"https://openalex.org/I182212641","display_name":"University of Salzburg","ror":"https://ror.org/05gs8cd61","country_code":"AT","type":"education","lineage":["https://openalex.org/I182212641"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Nikolaus Augsten","raw_affiliation_strings":["Universit\u00e4t Salzburg, Salzburg, Austria"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Salzburg, Salzburg, Austria","institution_ids":["https://openalex.org/I182212641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016904523","display_name":"Panagiotis Bouros","orcid":"https://orcid.org/0000-0002-8846-4330"},"institutions":[{"id":"https://openalex.org/I197323543","display_name":"Johannes Gutenberg University Mainz","ror":"https://ror.org/023b0x485","country_code":"DE","type":"education","lineage":["https://openalex.org/I197323543"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Panagiotis Bouros","raw_affiliation_strings":["Johannes Gutenberg University Mainz, Mainz, Germany"],"affiliations":[{"raw_affiliation_string":"Johannes Gutenberg University Mainz, Mainz, Germany","institution_ids":["https://openalex.org/I197323543"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055236937","display_name":"Ulf Leser","orcid":"https://orcid.org/0000-0003-2166-9582"},"institutions":[{"id":"https://openalex.org/I39343248","display_name":"Humboldt-Universit\u00e4t zu Berlin","ror":"https://ror.org/01hcx6992","country_code":"DE","type":"education","lineage":["https://openalex.org/I39343248"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Ulf Leser","raw_affiliation_strings":["Humboldt-Universit\u00e4t zu Berlin, Berlin, Germany"],"affiliations":[{"raw_affiliation_string":"Humboldt-Universit\u00e4t zu Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I39343248"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113500582","display_name":"Johann-Christoph Freytag","orcid":null},"institutions":[{"id":"https://openalex.org/I39343248","display_name":"Humboldt-Universit\u00e4t zu Berlin","ror":"https://ror.org/01hcx6992","country_code":"DE","type":"education","lineage":["https://openalex.org/I39343248"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Johann-Christoph Freytag","raw_affiliation_strings":["Humboldt-Universit\u00e4t zu Berlin, Berlin, Germany"],"affiliations":[{"raw_affiliation_string":"Humboldt-Universit\u00e4t zu Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I39343248"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5028799926"],"corresponding_institution_ids":["https://openalex.org/I39343248"],"apc_list":null,"apc_paid":null,"fwci":7.834,"has_fulltext":false,"cited_by_count":46,"citation_normalized_percentile":{"value":0.97552861,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"11","issue":"10","first_page":"1110","last_page":"1122"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9585999846458435,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joins","display_name":"Joins","score":0.9042567014694214},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8011124730110168},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.7448878288269043},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6839219331741333},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.6091489195823669},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.4665149450302124},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4620136320590973},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.4499663710594177},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.41407883167266846},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.4129060208797455},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3488868474960327},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.31307655572891235},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2684094309806824}],"concepts":[{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.9042567014694214},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8011124730110168},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.7448878288269043},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6839219331741333},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6091489195823669},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.4665149450302124},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4620136320590973},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.4499663710594177},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.41407883167266846},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4129060208797455},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3488868474960327},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31307655572891235},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2684094309806824},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3231751.3231760","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3231751.3231760","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"No poverty","id":"https://metadata.un.org/sdg/1","score":0.6600000262260437}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1598064945","https://openalex.org/W1950295237","https://openalex.org/W1973001156","https://openalex.org/W1987562803","https://openalex.org/W1991516927","https://openalex.org/W2001700730","https://openalex.org/W2002597960","https://openalex.org/W2033629528","https://openalex.org/W2061601738","https://openalex.org/W2065259291","https://openalex.org/W2073167797","https://openalex.org/W2086322024","https://openalex.org/W2093050254","https://openalex.org/W2096598900","https://openalex.org/W2097184821","https://openalex.org/W2097776316","https://openalex.org/W2099194763","https://openalex.org/W2104599107","https://openalex.org/W2109803107","https://openalex.org/W2115500858","https://openalex.org/W2121516976","https://openalex.org/W2132399973","https://openalex.org/W2137139422","https://openalex.org/W2151930506","https://openalex.org/W2166400748","https://openalex.org/W2169387919","https://openalex.org/W2169795703","https://openalex.org/W2173213060","https://openalex.org/W2270660075","https://openalex.org/W2294331997","https://openalex.org/W2396588571","https://openalex.org/W2518843341","https://openalex.org/W2521888566","https://openalex.org/W2616120155","https://openalex.org/W2752274179"],"related_works":["https://openalex.org/W2883952940","https://openalex.org/W2762277149","https://openalex.org/W2366051640","https://openalex.org/W2991126413","https://openalex.org/W3043816525","https://openalex.org/W2250140425","https://openalex.org/W199645745","https://openalex.org/W1998797251","https://openalex.org/W2275391457","https://openalex.org/W2734587838"],"abstract_inverted_index":{"Set":[0],"similarity":[1,36,92,154],"joins,":[2],"which":[3],"compute":[4],"pairs":[5],"of":[6,17,26,53,64,123,182],"similar":[7],"sets,":[8,149],"constitute":[9],"an":[10],"important":[11],"operator":[12],"primitive":[13],"in":[14,71,106,133,175,209],"a":[15,61,107,120,128,157,176],"variety":[16],"applications,":[18],"including":[19],"applications":[20],"that":[21,114,170],"must":[22],"process":[23],"large":[24,69],"amounts":[25],"data.":[27],"To":[28],"handle":[29,166],"these":[30,54],"data":[31],"volumes,":[32],"several":[33],"distributed":[34,90],"set":[35,91,151],"join":[37,93],"algorithms":[38,105,132,162,184],"have":[39],"been":[40],"proposed.":[41],"Unfortunately,":[42],"little":[43],"is":[44],"known":[45],"about":[46],"the":[47,68,72,98,104,167,183,186,189,210],"relative":[48],"performance,":[49],"strengths":[50],"and":[51,67,118,144,192],"weaknesses":[52],"techniques.":[55],"Previous":[56],"comparisons":[57],"are":[58,145],"limited":[59],"to":[60,79,137,147,165],"small":[62,168],"subset":[63],"relevant":[65],"algorithms,":[66,94],"differences":[70],"various":[73],"test":[74,109,135],"setups":[75],"make":[76],"it":[77],"hard":[78],"draw":[80],"overall":[81],"conclusions.":[82],"In":[83],"this":[84],"paper":[85],"we":[86,203],"survey":[87],"ten":[88],"recent,":[89],"all":[95],"based":[96],"on":[97,111,200],"MapReduce":[99],"paradigm.":[100],"We":[101],"empirically":[102],"compare":[103],"uniform":[108],"environment":[110],"twelve":[112],"datasets":[113,169],"expose":[115],"different":[116],"characteristics":[117],"represent":[119],"broad":[121],"range":[122],"applications.":[124],"Our":[125,179],"experiments":[126,194],"yield":[127],"surprising":[129],"result:":[130],"All":[131],"our":[134,196,201],"fail":[136,164],"scale":[138],"for":[139,188,206],"at":[140],"least":[141],"one":[142],"dataset":[143],"sensitive":[146],"long":[148],"frequent":[150],"elements,":[152],"low":[153],"thresholds,":[155],"or":[156],"combination":[158],"thereof.":[159],"Interestingly,":[160],"some":[161],"even":[163],"can":[171],"easily":[172],"be":[173],"processed":[174],"non-distributed":[177],"setting.":[178],"analytic":[180,197],"investigation":[181],"pinpoints":[185],"reasons":[187],"poor":[190],"performance":[191],"targeted":[193],"confirm":[195],"findings.":[198],"Based":[199],"investigation,":[202],"suggest":[204],"directions":[205],"future":[207],"research":[208],"area.":[211]},"counts_by_year":[{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":12},{"year":2020,"cited_by_count":10},{"year":2019,"cited_by_count":12}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
