{"id":"https://openalex.org/W4380433179","doi":"https://doi.org/10.1145/3588929","title":"Discovering Similarity Inclusion Dependencies","display_name":"Discovering Similarity Inclusion Dependencies","publication_year":2023,"publication_date":"2023-05-26","ids":{"openalex":"https://openalex.org/W4380433179","doi":"https://doi.org/10.1145/3588929"},"language":"en","primary_location":{"id":"doi:10.1145/3588929","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3588929","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3588929","source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3588929","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001444081","display_name":"Youri Kaminsky","orcid":"https://orcid.org/0009-0007-6547-592X"},"institutions":[{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]},{"id":"https://openalex.org/I176453806","display_name":"University of Potsdam","ror":"https://ror.org/03bnmw459","country_code":"DE","type":"education","lineage":["https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Youri Kaminsky","raw_affiliation_strings":["Hasso Plattner Institute, University of Potsdam, Potsdam, Germany"],"raw_orcid":"https://orcid.org/0009-0007-6547-592X","affiliations":[{"raw_affiliation_string":"Hasso Plattner Institute, University of Potsdam, Potsdam, Germany","institution_ids":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065023756","display_name":"Eduardo H. M. Pena","orcid":"https://orcid.org/0000-0002-4852-3113"},"institutions":[{"id":"https://openalex.org/I1283613182","display_name":"Universidade Tecnol\u00f3gica Federal do Paran\u00e1","ror":"https://ror.org/002v2kq79","country_code":"BR","type":"education","lineage":["https://openalex.org/I1283613182"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Eduardo H. M. Pena","raw_affiliation_strings":["Federal University of Technology - Paran\u00e1, Campo Mour\u00e3o, Brazil"],"raw_orcid":"https://orcid.org/0000-0002-4852-3113","affiliations":[{"raw_affiliation_string":"Federal University of Technology - Paran\u00e1, Campo Mour\u00e3o, Brazil","institution_ids":["https://openalex.org/I1283613182"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053028480","display_name":"Felix Naumann","orcid":"https://orcid.org/0000-0002-4483-1389"},"institutions":[{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]},{"id":"https://openalex.org/I176453806","display_name":"University of Potsdam","ror":"https://ror.org/03bnmw459","country_code":"DE","type":"education","lineage":["https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Felix Naumann","raw_affiliation_strings":["Hasso Plattner Institute, University of Potsdam, Potsdam, Germany"],"raw_orcid":"https://orcid.org/0000-0002-4483-1389","affiliations":[{"raw_affiliation_string":"Hasso Plattner Institute, University of Potsdam, Potsdam, Germany","institution_ids":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5001444081"],"corresponding_institution_ids":["https://openalex.org/I143288331","https://openalex.org/I176453806"],"apc_list":null,"apc_paid":null,"fwci":3.3573,"has_fulltext":true,"cited_by_count":14,"citation_normalized_percentile":{"value":0.92436477,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"1","issue":"1","first_page":"1","last_page":"24"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9750000238418579,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joins","display_name":"Joins","score":0.6992638111114502},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6721049547195435},{"id":"https://openalex.org/keywords/functional-dependency","display_name":"Functional dependency","score":0.6321688294410706},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.606168806552887},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5793702006340027},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.5595584511756897},{"id":"https://openalex.org/keywords/relaxation","display_name":"Relaxation (psychology)","score":0.419050008058548},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3671402037143707},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.33927929401397705},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.23101583123207092},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.17297953367233276},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.13220089673995972},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.0785931646823883}],"concepts":[{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.6992638111114502},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6721049547195435},{"id":"https://openalex.org/C26320393","wikidata":"https://www.wikidata.org/wiki/Q597053","display_name":"Functional dependency","level":3,"score":0.6321688294410706},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.606168806552887},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5793702006340027},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.5595584511756897},{"id":"https://openalex.org/C2776029896","wikidata":"https://www.wikidata.org/wiki/Q3935810","display_name":"Relaxation (psychology)","level":2,"score":0.419050008058548},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3671402037143707},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33927929401397705},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.23101583123207092},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.17297953367233276},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.13220089673995972},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0785931646823883},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3588929","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3588929","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3588929","source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3588929","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3588929","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3588929","source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.7599999904632568}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4380433179.pdf","grobid_xml":"https://content.openalex.org/works/W4380433179.grobid-xml"},"referenced_works_count":20,"referenced_works":["https://openalex.org/W1549531352","https://openalex.org/W1819449330","https://openalex.org/W1970026646","https://openalex.org/W2059009730","https://openalex.org/W2067494918","https://openalex.org/W2073112656","https://openalex.org/W2089966454","https://openalex.org/W2099725016","https://openalex.org/W2125552274","https://openalex.org/W2140313762","https://openalex.org/W2162294668","https://openalex.org/W2182148130","https://openalex.org/W2186686397","https://openalex.org/W2232417456","https://openalex.org/W2341748398","https://openalex.org/W2741470040","https://openalex.org/W2910285766","https://openalex.org/W2988782038","https://openalex.org/W3040722616","https://openalex.org/W3083028011"],"related_works":["https://openalex.org/W2392606101","https://openalex.org/W3162070149","https://openalex.org/W2072918301","https://openalex.org/W2133756937","https://openalex.org/W2385315033","https://openalex.org/W2362842011","https://openalex.org/W1845544376","https://openalex.org/W1839867872","https://openalex.org/W2048090520","https://openalex.org/W2166449856"],"abstract_inverted_index":{"Inclusion":[0],"dependencies":[1,52,106,134,164],"(INDs)":[2],"are":[3,17],"a":[4,81,166,185,201,204],"well-known":[5],"type":[6],"of":[7,14,21,43,128,150,177,206],"data":[8,57,88],"dependency,":[9],"specifying":[10],"that":[11,197],"the":[12,51,77,121,125,148,156,175],"values":[13],"one":[15],"column":[16],"contained":[18],"in":[19,147,165],"those":[20],"another":[22],"column.":[23],"INDs":[24,44,64,179],"can":[25,135,199],"be":[26],"used":[27],"for":[28,86,101,104,174],"various":[29],"purposes,":[30],"such":[31,90,108,141],"as":[32,91,142],"foreign-key":[33,143],"candidate":[34,144,191],"selection":[35],"or":[36,93],"join":[37],"partner":[38],"discovery.":[39],"The":[40],"traditional":[41,137,178],"notion":[42],"is":[45,74,99],"based":[46],"on":[47],"clean":[48],"data,":[49],"where":[50],"hold":[53],"without":[54],"exceptions.":[55],"Unfortunately,":[56],"often":[58],"contain":[59],"errors,":[60,89],"preventing":[61],"otherwise":[62],"valid":[63],"from":[65],"being":[66],"discovered.":[67],"A":[68],"typical":[69],"response":[70],"to":[71,75,84,159,208],"this":[72,97],"problem":[73],"relax":[76,120],"dependency":[78],"definition":[79],"using":[80],"similarity":[82,116,162,182],"measure":[83],"account":[85],"minor":[87],"typos":[92],"different":[94],"formatting.":[95],"While":[96],"relaxation":[98,109],"known":[100],"functional":[102],"dependencies,":[103,118],"inclusion":[105,117,122,133,163],"no":[107],"has":[110],"been":[111],"defined.":[112],"We":[113,153],"formally":[114],"introduce":[115],"which":[119],"by":[123,203],"demanding":[124],"existence":[126],"only":[127],"sufficiently":[129],"similar":[130],"values.":[131],"Similarity":[132],"fulfill":[136],"IND":[138],"use":[139],"cases,":[140],"discovery,":[145],"even":[146],"presence":[149],"dirty":[151],"data.":[152],"present":[154],"Sawfish,":[155],"first":[157],"algorithm":[158,171],"discover":[160],"all":[161],"given":[167],"dataset":[168],"efficiently.":[169],"Our":[170,193],"combines":[172],"approaches":[173],"discovery":[176],"and":[180,189],"string":[181],"joins":[183],"with":[184],"novel":[186],"sliding-window":[187],"approach":[188],"lazy":[190],"validation.":[192],"experimental":[194],"evaluation":[195],"shows":[196],"Sawfish":[198],"outperform":[200],"baseline":[202],"factor":[205],"up":[207],"6.5.":[209]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":3}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
