{"id":"https://openalex.org/W4378770496","doi":"https://doi.org/10.1145/3580305.3599314","title":"DotHash: Estimating Set Similarity Metrics for Link Prediction and Document Deduplication","display_name":"DotHash: Estimating Set Similarity Metrics for Link Prediction and Document Deduplication","publication_year":2023,"publication_date":"2023-08-04","ids":{"openalex":"https://openalex.org/W4378770496","doi":"https://doi.org/10.1145/3580305.3599314"},"language":"en","primary_location":{"id":"doi:10.1145/3580305.3599314","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3580305.3599314","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3580305.3599314","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3580305.3599314","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025100790","display_name":"Igor Nunes","orcid":"https://orcid.org/0000-0002-8443-4708"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Igor Nunes","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078316507","display_name":"Mike Heddes","orcid":"https://orcid.org/0000-0002-9276-458X"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mike Heddes","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048976468","display_name":"Pere Verg\u00e9s","orcid":"https://orcid.org/0000-0002-4109-1071"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pere Verg\u00e9s","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087277377","display_name":"Danny Abraham","orcid":"https://orcid.org/0009-0000-3857-8826"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Danny Abraham","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079080588","display_name":"Alex Veidenbaum","orcid":null},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alex Veidenbaum","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048632637","display_name":"Alex Nicolau","orcid":"https://orcid.org/0009-0003-9833-8455"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alex Nicolau","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019614336","display_name":"Tony Givargis","orcid":"https://orcid.org/0000-0002-1608-9324"},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tony Givargis","raw_affiliation_strings":["University of California, Irvine, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5025100790"],"corresponding_institution_ids":["https://openalex.org/I204250578"],"apc_list":null,"apc_paid":null,"fwci":0.8224,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.69306591,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1758","last_page":"1769"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10064","display_name":"Complex Network Analysis Techniques","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10064","display_name":"Complex Network Analysis Techniques","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11819","display_name":"Data-Driven Disease Surveillance","score":0.9717000126838684,"subfield":{"id":"https://openalex.org/subfields/2713","display_name":"Epidemiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/jaccard-index","display_name":"Jaccard index","score":0.9571873545646667},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7330207824707031},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.6588799953460693},{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.6484748125076294},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6271507740020752},{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.58341383934021},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5679001212120056},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5139636397361755},{"id":"https://openalex.org/keywords/index","display_name":"Index (typography)","score":0.4477607011795044},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4456782042980194},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3390467166900635},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.2638382911682129},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.24271327257156372},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.23663267493247986},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1946171522140503},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.17283537983894348},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.1367703080177307}],"concepts":[{"id":"https://openalex.org/C203519979","wikidata":"https://www.wikidata.org/wiki/Q865360","display_name":"Jaccard index","level":3,"score":0.9571873545646667},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7330207824707031},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6588799953460693},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.6484748125076294},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6271507740020752},{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.58341383934021},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5679001212120056},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5139636397361755},{"id":"https://openalex.org/C2777382242","wikidata":"https://www.wikidata.org/wiki/Q6017816","display_name":"Index (typography)","level":2,"score":0.4477607011795044},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4456782042980194},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3390467166900635},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2638382911682129},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.24271327257156372},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.23663267493247986},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1946171522140503},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.17283537983894348},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.1367703080177307},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3580305.3599314","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3580305.3599314","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3580305.3599314","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2305.17310","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.17310","pdf_url":"https://arxiv.org/pdf/2305.17310","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3580305.3599314","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3580305.3599314","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3580305.3599314","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4378770496.pdf","grobid_xml":"https://content.openalex.org/works/W4378770496.grobid-xml"},"referenced_works_count":87,"referenced_works":["https://openalex.org/W158057341","https://openalex.org/W177038005","https://openalex.org/W188608978","https://openalex.org/W1533841329","https://openalex.org/W1560267311","https://openalex.org/W1573190978","https://openalex.org/W1573706465","https://openalex.org/W1615576556","https://openalex.org/W1736726159","https://openalex.org/W1967784045","https://openalex.org/W1979104937","https://openalex.org/W2011015278","https://openalex.org/W2012833704","https://openalex.org/W2024932032","https://openalex.org/W2032448673","https://openalex.org/W2055906546","https://openalex.org/W2066692087","https://openalex.org/W2069005511","https://openalex.org/W2070862086","https://openalex.org/W2081193615","https://openalex.org/W2114060717","https://openalex.org/W2118947057","https://openalex.org/W2123402141","https://openalex.org/W2123427850","https://openalex.org/W2126907894","https://openalex.org/W2132069633","https://openalex.org/W2137939096","https://openalex.org/W2143996849","https://openalex.org/W2144211451","https://openalex.org/W2145349611","https://openalex.org/W2146591355","https://openalex.org/W2147717514","https://openalex.org/W2149055314","https://openalex.org/W2154454189","https://openalex.org/W2169150382","https://openalex.org/W2269316186","https://openalex.org/W2308071406","https://openalex.org/W2400908863","https://openalex.org/W2420733993","https://openalex.org/W2508837377","https://openalex.org/W2553570933","https://openalex.org/W2555576367","https://openalex.org/W2564084673","https://openalex.org/W2591634218","https://openalex.org/W2609517418","https://openalex.org/W2736973865","https://openalex.org/W2785764160","https://openalex.org/W2792591449","https://openalex.org/W2890276152","https://openalex.org/W2890715498","https://openalex.org/W2900569176","https://openalex.org/W2948646149","https://openalex.org/W2950150251","https://openalex.org/W2950287694","https://openalex.org/W2955798121","https://openalex.org/W2964694902","https://openalex.org/W2988916019","https://openalex.org/W2999817249","https://openalex.org/W3021975806","https://openalex.org/W3026640598","https://openalex.org/W3028444035","https://openalex.org/W3030994385","https://openalex.org/W3032503335","https://openalex.org/W3043995050","https://openalex.org/W3083028011","https://openalex.org/W3100078588","https://openalex.org/W3102219154","https://openalex.org/W3111141572","https://openalex.org/W3125044119","https://openalex.org/W3150644974","https://openalex.org/W3160872503","https://openalex.org/W3175308985","https://openalex.org/W3177765786","https://openalex.org/W3210084676","https://openalex.org/W4229641819","https://openalex.org/W4230940751","https://openalex.org/W4235571646","https://openalex.org/W4237589487","https://openalex.org/W4281260224","https://openalex.org/W4282570649","https://openalex.org/W4288101963","https://openalex.org/W4295312788","https://openalex.org/W4299841484","https://openalex.org/W4310895557","https://openalex.org/W4398623896","https://openalex.org/W6698240980","https://openalex.org/W6756129506"],"related_works":["https://openalex.org/W4396220545","https://openalex.org/W1994775821","https://openalex.org/W2012019886","https://openalex.org/W2091133150","https://openalex.org/W2331322489","https://openalex.org/W2945869148","https://openalex.org/W2611195251","https://openalex.org/W2398781203","https://openalex.org/W2009279505","https://openalex.org/W4206503171"],"abstract_inverted_index":{"Metrics":[0],"for":[1,21,94,135,146],"set":[2,95],"similarity":[3,74,96],"are":[4,106],"a":[5,18,23,40,181],"core":[6],"aspect":[7],"of":[8,34,58,66,83,121,130,150,166,183,191,199],"several":[9],"data":[10,67,122],"mining":[11],"tasks.":[12],"To":[13],"remove":[14],"duplicate":[15,225],"results":[16,209],"in":[17,54,109,220],"Web":[19],"search,":[20],"example,":[22],"common":[24],"approach":[25],"looks":[26],"at":[27,85],"the":[28,44,55,63,72,128,133,147,159,164,170,177,197,217,228],"Jaccard":[29,160],"index":[30,161,179],"between":[31,75],"all":[32,76],"pairs":[33,77],"pages.":[35],"In":[36],"social":[37],"network":[38],"analysis,":[39],"much-celebrated":[41],"metric":[42],"is":[43,138,169,213],"Adamic-Adar":[45,178],"index,":[46],"widely":[47],"used":[48,108,156],"to":[49,68,124,157,163],"compare":[50],"node":[51],"neighborhood":[52],"sets":[53],"important":[56],"problem":[57],"predicting":[59],"links.":[60],"However,":[61],"with":[62,227],"increasing":[64],"amount":[65],"be":[69,79,125,155],"processed,":[70],"calculating":[71],"exact":[73],"can":[78,154,174],"intractable.":[80],"The":[81,98],"challenge":[82],"working":[84],"this":[86,189],"scale":[87],"has":[88],"motivated":[89],"research":[90],"into":[91],"efficient":[92],"estimators":[93,137,219],"metrics.":[97,185],"two":[99,151],"most":[100],"popular":[101],"estimators,":[102],"MinHash":[103],"and":[104,115,180,202,223,231],"SimHash,":[105],"indeed":[107],"applications":[110],"such":[111],"as":[112],"document":[113],"deduplication":[114],"recommender":[116],"systems":[117],"where":[118],"large":[119],"volumes":[120],"need":[123],"processed.":[126],"Given":[127],"importance":[129],"these":[131],"tasks,":[132],"demand":[134],"advancing":[136],"evident.":[139],"We":[140,186],"propose":[141],"DotHash,":[142],"an":[143],"unbiased":[144],"estimator":[145],"intersection":[148],"size":[149],"sets.":[152],"DotHash":[153,212],"estimate":[158,176,200],"and,":[162],"best":[165],"our":[167],"knowledge,":[168],"first":[171],"method":[172],"that":[173,211],"also":[175],"family":[182,190],"related":[184],"formally":[187],"define":[188],"metrics,":[192],"provide":[193],"theoretical":[194],"bounds":[195],"on":[196],"probability":[198],"errors,":[201],"analyze":[203],"its":[204],"empirical":[205],"performance.":[206],"Our":[207],"experimental":[208],"indicate":[210],"more":[214],"accurate":[215],"than":[216],"other":[218],"link":[221],"prediction":[222],"detecting":[224],"documents":[226],"same":[229],"complexity":[230],"similar":[232],"comparison":[233],"time.":[234]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
