{"id":"https://openalex.org/W3170190513","doi":"https://doi.org/10.1145/3448016.3452824","title":"Auto-FuzzyJoin","display_name":"Auto-FuzzyJoin","publication_year":2021,"publication_date":"2021-06-09","ids":{"openalex":"https://openalex.org/W3170190513","doi":"https://doi.org/10.1145/3448016.3452824","mag":"3170190513"},"language":"en","primary_location":{"id":"doi:10.1145/3448016.3452824","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3448016.3452824","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100432795","display_name":"Peng Li","orcid":"https://orcid.org/0000-0003-4981-0496"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Peng Li","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062327704","display_name":"Xiang Cheng","orcid":"https://orcid.org/0000-0001-6556-2264"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiang Cheng","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100430982","display_name":"Xu Chu","orcid":"https://orcid.org/0009-0007-3202-3767"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xu Chu","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034908019","display_name":"Yeye He","orcid":"https://orcid.org/0000-0003-2824-5299"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yeye He","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038037154","display_name":"Surajit Chaudhuri","orcid":"https://orcid.org/0000-0001-8252-5270"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Surajit Chaudhuri","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100432795"],"corresponding_institution_ids":["https://openalex.org/I130701444"],"apc_list":null,"apc_paid":null,"fwci":2.7884,"has_fulltext":false,"cited_by_count":23,"citation_normalized_percentile":{"value":0.90341033,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1064","last_page":"1076"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7933085560798645},{"id":"https://openalex.org/keywords/joins","display_name":"Joins","score":0.7402216792106628},{"id":"https://openalex.org/keywords/fuzzy-logic","display_name":"Fuzzy logic","score":0.6686960458755493},{"id":"https://openalex.org/keywords/join","display_name":"Join (topology)","score":0.6425102353096008},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.630828857421875},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4601595401763916},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.43889909982681274},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.38693878054618835}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7933085560798645},{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.7402216792106628},{"id":"https://openalex.org/C58166","wikidata":"https://www.wikidata.org/wiki/Q224821","display_name":"Fuzzy logic","level":2,"score":0.6686960458755493},{"id":"https://openalex.org/C2776124973","wikidata":"https://www.wikidata.org/wiki/Q3183033","display_name":"Join (topology)","level":2,"score":0.6425102353096008},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.630828857421875},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4601595401763916},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.43889909982681274},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38693878054618835},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3448016.3452824","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3448016.3452824","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Management of Data","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1552847225","https://openalex.org/W1721994796","https://openalex.org/W1746620543","https://openalex.org/W1799378028","https://openalex.org/W1870305865","https://openalex.org/W1973001156","https://openalex.org/W1981590391","https://openalex.org/W2021739502","https://openalex.org/W2065259291","https://openalex.org/W2073471108","https://openalex.org/W2093050254","https://openalex.org/W2097776316","https://openalex.org/W2104599107","https://openalex.org/W2105423800","https://openalex.org/W2111116800","https://openalex.org/W2121269638","https://openalex.org/W2121516976","https://openalex.org/W2128793454","https://openalex.org/W2132452995","https://openalex.org/W2151930506","https://openalex.org/W2260484439","https://openalex.org/W2270660075","https://openalex.org/W2286724461","https://openalex.org/W2295873867","https://openalex.org/W2399361902","https://openalex.org/W2492590231","https://openalex.org/W2542998387","https://openalex.org/W2616147950","https://openalex.org/W2791315675","https://openalex.org/W2798649495","https://openalex.org/W2946504770","https://openalex.org/W3029701880","https://openalex.org/W3106020963","https://openalex.org/W3135670153","https://openalex.org/W3146259567","https://openalex.org/W3208204807"],"related_works":["https://openalex.org/W2393491644","https://openalex.org/W4206577045","https://openalex.org/W3086237447","https://openalex.org/W650102067","https://openalex.org/W2740404111","https://openalex.org/W1550806730","https://openalex.org/W1496672428","https://openalex.org/W2589740103","https://openalex.org/W1966967794","https://openalex.org/W1501284171"],"abstract_inverted_index":{"Fuzzy":[0],"similarity":[1],"join":[2],"is":[3,153,209],"an":[4,99],"important":[5],"database":[6],"operator":[7],"widely":[8],"used":[9,226],"in":[10,59,167],"practice.":[11],"So":[12],"far":[13],"the":[14,86,146,151,163,175,200],"research":[15],"community":[16],"has":[17,74],"focused":[18],"exclusively":[19],"on":[20,109,182,237],"optimizing":[21],"fuzzy":[22],"joinscalability.":[23],"However,":[24],"practitioners":[25],"today":[26],"also":[27],"struggle":[28],"to":[29,48,50,55,61,128,157,239],"optimize":[30,62],"fuzzy-joinquality,":[31],"because":[32],"they":[33],"face":[34],"a":[35,51,92,137,154,187],"daunting":[36],"space":[37],"of":[38,68,88,95,150,177,222],"parameters":[39,58],"(e.g.,":[40,216],"distance-functions,":[41,96],"distance-thresholds,":[42],"tokenization-options,":[43],"etc.),":[44],"and":[45,134,136,186,208,218,235],"often":[46],"have":[47,231],"resort":[49],"manual":[52],"trial-and-error":[53],"approach":[54],"program":[56,159],"these":[57],"order":[60],"fuzzy-join":[63,72,107,171,192],"quality.":[64],"This":[65],"key":[66],"challenge":[67],"automatically":[69,158],"generating":[70],"high-quality":[71],"programs":[73,108],"received":[75],"surprisingly":[76,210],"little":[77],"attention":[78],"thus":[79],"far.":[80],"In":[81],"this":[82],"work,":[83],"we":[84,97],"study":[85],"problem":[87],"\"auto-program''":[89],"fuzzy-joins.":[90],"Leveraging":[91],"geometric":[93],"interpretation":[94],"develop":[98],"unsupervised":[100,206],"Auto-FuzzyJoin":[101,144,202],"framework":[102],"that":[103,148,161,199],"can":[104],"infer":[105],"suitable":[106],"given":[110],"input":[111,117,131,152],"tables,":[112],"without":[113],"requiring":[114],"explicit":[115],"human":[116],"such":[118],"as":[119,174,227],"labelled":[120],"training":[121,228],"data.":[122,229],"Using":[123],"Auto-FuzzyJoin,":[124],"users":[125],"only":[126],"need":[127],"provide":[129],"two":[130],"tables":[132],"L":[133],"R,":[135],"desired":[138],"precision":[139,164],"target":[140,165],"\u03c4":[141,166],"(say":[142],"0.9).":[143],"leverages":[145],"fact":[147],"one":[149],"reference":[155],"table":[156],"fuzzy-joins":[160],"meet":[162],"expectation,":[168],"while":[169],"maximizing":[170],"recall":[172],"(defined":[173],"number":[176],"correctly":[178],"joined":[179],"records).":[180],"Experiments":[181],"both":[183],"existing":[184,205],"benchmarks":[185],"new":[188],"benchmark":[189,236],"with":[190],"50":[191],"tasks":[193],"created":[194],"from":[195],"Wikipedia":[196],"data":[197],"suggest":[198],"proposed":[201],"significantly":[203],"outperforms":[204],"approaches,":[207],"competitive":[211],"even":[212],"against":[213],"supervised":[214],"approaches":[215],"Magellan":[217],"DeepMatcher)":[219],"when":[220],"50%":[221],"ground-truth":[223],"labels":[224],"are":[225],"We":[230],"released":[232],"our":[233],"code":[234],"GitHub\\footnote\\urlhttps://github.com/chu-data-lab/AutomaticFuzzyJoin":[238],"facilitate":[240],"future":[241],"research.":[242]},"counts_by_year":[{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2021-06-22T00:00:00"}
