{"id":"https://openalex.org/W2166400748","doi":"https://doi.org/10.14778/2732296.2732299","title":"String similarity joins","display_name":"String similarity joins","publication_year":2014,"publication_date":"2014-04-01","ids":{"openalex":"https://openalex.org/W2166400748","doi":"https://doi.org/10.14778/2732296.2732299","mag":"2166400748"},"language":"en","primary_location":{"id":"doi:10.14778/2732296.2732299","is_oa":false,"landing_page_url":"https://doi.org/10.14778/2732296.2732299","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060117799","display_name":"Yu Jiang","orcid":"https://orcid.org/0000-0003-0955-503X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yu Jiang","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100451576","display_name":"Guoliang Li","orcid":"https://orcid.org/0000-0002-1398-0621"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guoliang Li","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100951661","display_name":"Jianhua Feng","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianhua Feng","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067120879","display_name":"Wen-Syan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen-Syan Li","raw_affiliation_strings":["SAP Lab, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"SAP Lab, Shanghai, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5060117799"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":23.5375,"has_fulltext":false,"cited_by_count":134,"citation_normalized_percentile":{"value":0.99660188,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":"7","issue":"8","first_page":"625","last_page":"636"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9873999953269958,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9735999703407288,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joins","display_name":"Joins","score":0.8444742560386658},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7189843654632568},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6855562925338745},{"id":"https://openalex.org/keywords/string","display_name":"String (physics)","score":0.6619868874549866},{"id":"https://openalex.org/keywords/string-metric","display_name":"String metric","score":0.6522970199584961},{"id":"https://openalex.org/keywords/join","display_name":"Join (topology)","score":0.6121538877487183},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.6086612939834595},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5505290031433105},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.46071135997772217},{"id":"https://openalex.org/keywords/string-searching-algorithm","display_name":"String searching algorithm","score":0.4453069269657135},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.43266797065734863},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3936312794685364},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.2820727825164795},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.24913331866264343},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1326654851436615},{"id":"https://openalex.org/keywords/pattern-matching","display_name":"Pattern matching","score":0.10646915435791016},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.06454271078109741}],"concepts":[{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.8444742560386658},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7189843654632568},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6855562925338745},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.6619868874549866},{"id":"https://openalex.org/C22820288","wikidata":"https://www.wikidata.org/wiki/Q9050568","display_name":"String metric","level":4,"score":0.6522970199584961},{"id":"https://openalex.org/C2776124973","wikidata":"https://www.wikidata.org/wiki/Q3183033","display_name":"Join (topology)","level":2,"score":0.6121538877487183},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.6086612939834595},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5505290031433105},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.46071135997772217},{"id":"https://openalex.org/C7757238","wikidata":"https://www.wikidata.org/wiki/Q374040","display_name":"String searching algorithm","level":3,"score":0.4453069269657135},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.43266797065734863},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3936312794685364},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2820727825164795},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.24913331866264343},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1326654851436615},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.10646915435791016},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.06454271078109741},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C37914503","wikidata":"https://www.wikidata.org/wiki/Q156495","display_name":"Mathematical physics","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/2732296.2732299","is_oa":false,"landing_page_url":"https://doi.org/10.14778/2732296.2732299","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320323893","display_name":"Fundo para o Desenvolvimento das Ci\u00eancias e da Tecnologia","ror":"https://ror.org/05vna4324"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1973001156","https://openalex.org/W1979666709","https://openalex.org/W1991175610","https://openalex.org/W2000482994","https://openalex.org/W2019412213","https://openalex.org/W2054693333","https://openalex.org/W2064926388","https://openalex.org/W2073329022","https://openalex.org/W2096598900","https://openalex.org/W2097184821","https://openalex.org/W2097776316","https://openalex.org/W2100548092","https://openalex.org/W2104599107","https://openalex.org/W2105436061","https://openalex.org/W2115214414","https://openalex.org/W2121269638","https://openalex.org/W2121516976","https://openalex.org/W2127675794","https://openalex.org/W2130825214","https://openalex.org/W2147033904","https://openalex.org/W2148148676","https://openalex.org/W2150916025","https://openalex.org/W2151930506","https://openalex.org/W2156855109","https://openalex.org/W2157092487","https://openalex.org/W2161936973","https://openalex.org/W2167847032","https://openalex.org/W6674576723","https://openalex.org/W6683401941"],"related_works":["https://openalex.org/W2393491644","https://openalex.org/W4206577045","https://openalex.org/W650102067","https://openalex.org/W3086237447","https://openalex.org/W2740404111","https://openalex.org/W1550806730","https://openalex.org/W4288309913","https://openalex.org/W4230921427","https://openalex.org/W2998410131","https://openalex.org/W2462428821"],"abstract_inverted_index":{"String":[0],"similarity":[1,97,145],"join":[2,98,146],"is":[3],"an":[4],"important":[5],"operation":[6],"in":[7,33,82],"data":[8],"integration":[9],"and":[10,110,134,141],"cleansing":[11],"that":[12],"finds":[13],"similar":[14],"string":[15,96],"pairs":[16],"from":[17,131],"two":[18,36],"collections":[19],"of":[20,94,119,143],"strings.":[21],"More":[22],"than":[23],"ten":[24],"algorithms":[25,40,54,71,147,155],"have":[26,41],"been":[27,43],"proposed":[28],"to":[29,68,152],"address":[30,79],"this":[31,80,83],"problem":[32],"the":[34,47,132,139],"recent":[35],"decades.":[37],"However,":[38],"existing":[39,95,144],"not":[42],"thoroughly":[44],"compared":[45],"under":[46],"same":[48],"experimental":[49],"framework.":[50],"For":[51],"example,":[52],"some":[53],"are":[55],"tested":[56],"only":[57],"on":[58,90,106,116],"specific":[59],"datasets.":[60],"This":[61],"makes":[62],"it":[63],"rather":[64],"difficult":[65],"for":[66,75,156],"practitioners":[67,151],"decide":[69],"which":[70,148],"should":[72],"be":[73],"used":[74],"various":[76,157],"scenarios.":[77,158],"To":[78],"problem,":[81],"paper":[84],"we":[85],"provide":[86,135],"a":[87,91,117],"comprehensive":[88,128],"survey":[89],"wide":[92],"spectrum":[93],"algorithms,":[99],"classify":[100],"them":[101,112],"into":[102],"different":[103,123],"categories":[104],"based":[105],"their":[107],"main":[108],"techniques,":[109],"compare":[111],"through":[113],"extensive":[114],"experiments":[115,133],"variety":[118],"real-world":[120],"datasets":[121],"with":[122],"characteristics.":[124],"We":[125],"also":[126],"report":[127],"findings":[129],"obtained":[130],"new":[136],"insights":[137],"about":[138],"strengths":[140],"weaknesses":[142],"can":[149],"guide":[150],"select":[153],"appropriate":[154]},"counts_by_year":[{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":9},{"year":2020,"cited_by_count":11},{"year":2019,"cited_by_count":22},{"year":2018,"cited_by_count":17},{"year":2017,"cited_by_count":25},{"year":2016,"cited_by_count":13},{"year":2015,"cited_by_count":17},{"year":2014,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
