{"id":"https://openalex.org/W4306317056","doi":"https://doi.org/10.1145/3511808.3557413","title":"\u2118-MinHash Algorithm for Continuous Probability Measures","display_name":"\u2118-MinHash Algorithm for Continuous Probability Measures","publication_year":2022,"publication_date":"2022-10-16","ids":{"openalex":"https://openalex.org/W4306317056","doi":"https://doi.org/10.1145/3511808.3557413"},"language":"en","primary_location":{"id":"doi:10.1145/3511808.3557413","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3511808.3557413","pdf_url":null,"source":{"id":"https://openalex.org/S4363608762","display_name":"Proceedings of the 31st ACM International Conference on Information &amp; Knowledge Management","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Information &amp; Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100435468","display_name":"Ping Li","orcid":"https://orcid.org/0000-0001-8272-6582"},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ping Li","raw_affiliation_strings":["LinkedIn Corporation, Bellevue, WA, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn Corporation, Bellevue, WA, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100781896","display_name":"Xiaoyun Li","orcid":"https://orcid.org/0000-0001-6039-6893"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaoyun Li","raw_affiliation_strings":["Rutgers University, Piscattway, NJ, USA"],"affiliations":[{"raw_affiliation_string":"Rutgers University, Piscattway, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051592584","display_name":"Gennady Samorodnitsky","orcid":"https://orcid.org/0000-0001-9947-2574"},"institutions":[{"id":"https://openalex.org/I205783295","display_name":"Cornell University","ror":"https://ror.org/05bnh6r87","country_code":"US","type":"education","lineage":["https://openalex.org/I205783295"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gennady Samorodnitsky","raw_affiliation_strings":["Cornell University, Ithaca, NY, USA"],"affiliations":[{"raw_affiliation_string":"Cornell University, Ithaca, NY, USA","institution_ids":["https://openalex.org/I205783295"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100435468"],"corresponding_institution_ids":["https://openalex.org/I1316064682"],"apc_list":null,"apc_paid":null,"fwci":0.4372,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.61760441,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1177","last_page":"1187"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/jaccard-index","display_name":"Jaccard index","score":0.9774012565612793},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6008586883544922},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.5243958830833435},{"id":"https://openalex.org/keywords/probability-distribution","display_name":"Probability distribution","score":0.5095961093902588},{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.4742816090583801},{"id":"https://openalex.org/keywords/probability-mass-function","display_name":"Probability mass function","score":0.41616716980934143},{"id":"https://openalex.org/keywords/invariant","display_name":"Invariant (physics)","score":0.41207146644592285},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3727228045463562},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3277134895324707},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3161671757698059},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.22565796971321106},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.18799078464508057},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.17159900069236755}],"concepts":[{"id":"https://openalex.org/C203519979","wikidata":"https://www.wikidata.org/wiki/Q865360","display_name":"Jaccard index","level":3,"score":0.9774012565612793},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6008586883544922},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5243958830833435},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.5095961093902588},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.4742816090583801},{"id":"https://openalex.org/C197096303","wikidata":"https://www.wikidata.org/wiki/Q869887","display_name":"Probability mass function","level":3,"score":0.41616716980934143},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.41207146644592285},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3727228045463562},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3277134895324707},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3161671757698059},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.22565796971321106},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.18799078464508057},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.17159900069236755},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C37914503","wikidata":"https://www.wikidata.org/wiki/Q156495","display_name":"Mathematical physics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3511808.3557413","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3511808.3557413","pdf_url":null,"source":{"id":"https://openalex.org/S4363608762","display_name":"Proceedings of the 31st ACM International Conference on Information &amp; Knowledge Management","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Information &amp; Knowledge Management","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8333178664","display_name":null,"funder_award_id":"2015242","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W1437397736","https://openalex.org/W1537946535","https://openalex.org/W1942295288","https://openalex.org/W1949708275","https://openalex.org/W1975517671","https://openalex.org/W1978024959","https://openalex.org/W1998840390","https://openalex.org/W1999092742","https://openalex.org/W2012833704","https://openalex.org/W2018218137","https://openalex.org/W2029852131","https://openalex.org/W2031252661","https://openalex.org/W2040886165","https://openalex.org/W2047756776","https://openalex.org/W2053377618","https://openalex.org/W2096810317","https://openalex.org/W2126907894","https://openalex.org/W2140431670","https://openalex.org/W2142256417","https://openalex.org/W2152228468","https://openalex.org/W2153635508","https://openalex.org/W2168467811","https://openalex.org/W2284900416","https://openalex.org/W2483327705","https://openalex.org/W2560674852","https://openalex.org/W2772632044","https://openalex.org/W2785764160","https://openalex.org/W2891743895","https://openalex.org/W2911038074","https://openalex.org/W2950044153","https://openalex.org/W2951006102","https://openalex.org/W2963517218","https://openalex.org/W2978329087","https://openalex.org/W3083090961","https://openalex.org/W3093698740","https://openalex.org/W3102219154","https://openalex.org/W3117993946","https://openalex.org/W3132833448","https://openalex.org/W3155434787","https://openalex.org/W3175308985","https://openalex.org/W4213245422","https://openalex.org/W4247345911","https://openalex.org/W4306317216","https://openalex.org/W4310895557","https://openalex.org/W6691585749"],"related_works":["https://openalex.org/W4249885815","https://openalex.org/W1575718162","https://openalex.org/W1850854912","https://openalex.org/W1534282248","https://openalex.org/W2136538477","https://openalex.org/W1976517975","https://openalex.org/W4285149740","https://openalex.org/W253462378","https://openalex.org/W2963025983","https://openalex.org/W4247405058"],"abstract_inverted_index":{"This":[0],"paper":[1],"studies":[2],"the":[3,33,48,52,85,114,133,141],"scale-invariant":[4],"\"probability":[5],"Jaccard''":[6],"(ProbJ),":[7],"noted":[8],"as":[9],"\u2110\u2118,":[10],"which":[11,100],"is":[12,26,58,124],"another":[13],"variant":[14],"of":[15,29,50,87,143],"weighted":[16],"Jaccard":[17,24,35],"similarity.":[18],"The":[19],"standard":[20],"and":[21,65,80,93,107,137,139],"commonly":[22],"used":[23],"index":[25],"not":[27],"invariant":[28],"data":[30],"scaling.":[31],"Thus,":[32],"probability":[34,43,56],"can":[36],"be":[37],"a":[38,95,119,127],"potentially":[39],"useful":[40],"extension":[41],"to":[42],"distributions.":[44],"Before":[45],"our":[46,73,144],"paper,":[47],"problem":[49,78],"hashing":[51],"\u2110\u2118":[53,88,109],"for":[54],"continuous":[55,90],"measures":[57],"an":[59],"open":[60],"problem,":[61],"where":[62],"rigorous":[63],"definitions":[64],"analysis":[66],"are":[67],"still":[68],"absent":[69],"in":[70,89,146],"literature.":[71],"In":[72,117],"work,":[74],"we":[75,83],"solve":[76],"this":[77],"systematically":[79],"completely.":[81],"Specifically,":[82],"formalize":[84],"definition":[86],"measure":[91],"space,":[92],"propose":[94],"general":[96],"\u2118-MinHash":[97],"sampling":[98],"algorithm":[99],"generates":[101],"samples":[102],"following":[103],"any":[104],"target":[105],"distribution,":[106],"preserves":[108],"between":[110],"two":[111],"distributions":[112],"by":[113],"hash":[115],"collision.":[116],"addition,":[118],"refined":[120],"early":[121],"stopping":[122],"rule":[123],"proposed":[125],"under":[126],"practical":[128],"boundedness":[129],"assumption.":[130],"We":[131],"validate":[132],"theory":[134],"through":[135],"simulation":[136],"experiments,":[138],"demonstrate":[140],"application":[142],"method":[145],"machine":[147],"learning":[148],"problems.":[149]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
