{"id":"https://openalex.org/W4366729173","doi":"https://doi.org/10.14778/3583140.3583163","title":"Sparkly: A Simple yet Surprisingly Strong TF/IDF Blocker for Entity Matching","display_name":"Sparkly: A Simple yet Surprisingly Strong TF/IDF Blocker for Entity Matching","publication_year":2023,"publication_date":"2023-02-01","ids":{"openalex":"https://openalex.org/W4366729173","doi":"https://doi.org/10.14778/3583140.3583163"},"language":"en","primary_location":{"id":"doi:10.14778/3583140.3583163","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3583140.3583163","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004171621","display_name":"Derek J. Paulsen","orcid":null},"institutions":[{"id":"https://openalex.org/I135310074","display_name":"University of Wisconsin\u2013Madison","ror":"https://ror.org/01y2jtd41","country_code":"US","type":"education","lineage":["https://openalex.org/I135310074"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Derek Paulsen","raw_affiliation_strings":["University of Wisconsin-Madison and Informatica Inc"],"affiliations":[{"raw_affiliation_string":"University of Wisconsin-Madison and Informatica Inc","institution_ids":["https://openalex.org/I135310074"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088781274","display_name":"Yash Govind","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yash Govind","raw_affiliation_strings":["Apple Inc"],"affiliations":[{"raw_affiliation_string":"Apple Inc","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110256670","display_name":"AnHai Doan","orcid":null},"institutions":[{"id":"https://openalex.org/I135310074","display_name":"University of Wisconsin\u2013Madison","ror":"https://ror.org/01y2jtd41","country_code":"US","type":"education","lineage":["https://openalex.org/I135310074"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"AnHai Doan","raw_affiliation_strings":["University of Wisconsin-Madison and Informatica Inc"],"affiliations":[{"raw_affiliation_string":"University of Wisconsin-Madison and Informatica Inc","institution_ids":["https://openalex.org/I135310074"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5004171621"],"corresponding_institution_ids":["https://openalex.org/I135310074"],"apc_list":null,"apc_paid":null,"fwci":7.1323,"has_fulltext":false,"cited_by_count":28,"citation_normalized_percentile":{"value":0.97189483,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":"16","issue":"6","first_page":"1507","last_page":"1519"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9789000153541565,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9710000157356262,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/blocking","display_name":"Blocking (statistics)","score":0.8234900236129761},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.818668007850647},{"id":"https://openalex.org/keywords/predictability","display_name":"Predictability","score":0.5500855445861816},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5056424140930176},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.49289417266845703},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.4914294481277466},{"id":"https://openalex.org/keywords/nothing","display_name":"Nothing","score":0.47306573390960693},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.4543340802192688},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4215329885482788},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.36115482449531555},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.1131872832775116},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09901461005210876},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08240580558776855},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.07223352789878845}],"concepts":[{"id":"https://openalex.org/C144745244","wikidata":"https://www.wikidata.org/wiki/Q4927286","display_name":"Blocking (statistics)","level":2,"score":0.8234900236129761},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.818668007850647},{"id":"https://openalex.org/C197640229","wikidata":"https://www.wikidata.org/wiki/Q2534066","display_name":"Predictability","level":2,"score":0.5500855445861816},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5056424140930176},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.49289417266845703},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.4914294481277466},{"id":"https://openalex.org/C136815107","wikidata":"https://www.wikidata.org/wiki/Q154242","display_name":"Nothing","level":2,"score":0.47306573390960693},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.4543340802192688},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4215329885482788},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.36115482449531555},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.1131872832775116},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09901461005210876},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08240580558776855},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.07223352789878845},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3583140.3583163","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3583140.3583163","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5199999809265137,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W67656603","https://openalex.org/W1980344365","https://openalex.org/W2031250218","https://openalex.org/W2063694594","https://openalex.org/W2105423800","https://openalex.org/W2105436061","https://openalex.org/W2107966677","https://openalex.org/W2112101330","https://openalex.org/W2546672044","https://openalex.org/W2612526608","https://openalex.org/W2791946429","https://openalex.org/W2798649495","https://openalex.org/W2889284307","https://openalex.org/W2945883855","https://openalex.org/W2948709946","https://openalex.org/W2957204582","https://openalex.org/W3011807731","https://openalex.org/W3014295153","https://openalex.org/W3014705052","https://openalex.org/W3015786360","https://openalex.org/W3029269967","https://openalex.org/W3092962901","https://openalex.org/W3123375411","https://openalex.org/W3134767663","https://openalex.org/W3170190513","https://openalex.org/W3186756857","https://openalex.org/W3197468999","https://openalex.org/W4211193898","https://openalex.org/W4213009331","https://openalex.org/W4242744113","https://openalex.org/W4252684946","https://openalex.org/W4385270265","https://openalex.org/W6679046037"],"related_works":["https://openalex.org/W2216420239","https://openalex.org/W3011492772","https://openalex.org/W2499122376","https://openalex.org/W2800570524","https://openalex.org/W2808813869","https://openalex.org/W2726467123","https://openalex.org/W2064726690","https://openalex.org/W4252678288","https://openalex.org/W2109915140","https://openalex.org/W2315519183"],"abstract_inverted_index":{"Blocking":[0],"is":[1],"a":[2,69,74,136,161],"major":[3],"task":[4],"in":[5,48,55,68],"entity":[6],"matching.":[7],"Numerous":[8],"blocking":[9,21,38,54,67,129,141,149],"solutions":[10],"have":[11],"been":[12],"developed,":[13],"but":[14],"as":[15,17],"far":[16],"we":[18,34,41,51,109],"can":[19,87],"tell,":[20],"using":[22,39],"the":[23],"well-known":[24],"tf/idf":[25,37,53,66,128],"measure":[26],"has":[27],"received":[28],"virtually":[29],"no":[30],"attention.":[31],"Yet,":[32],"when":[33],"experimented":[35],"with":[36],"Lucene,":[40],"found":[42],"it":[43],"did":[44],"quite":[45],"well.":[46],"So":[47],"this":[49],"paper":[50],"examine":[52],"depth.":[56],"We":[57,77,97],"develop":[58,78],"Sparkly,":[59],"which":[60,156,165],"uses":[61],"Lucene":[62],"to":[63,80,90],"perform":[64,98],"top-k":[65,154],"distributed":[70,162],"share-nothing":[71,163],"fashion":[72],"on":[73],"Spark":[75],"cluster.":[76],"techniques":[79],"identify":[81],"good":[82],"attributes":[83],"and":[84,121,146,160,170],"tokenizers":[85],"that":[86,102,126,139],"be":[88],"used":[89],"block":[91],"on,":[92],"making":[93],"Sparkly":[94,103,134],"completely":[95],"automatic.":[96],"extensive":[99],"experiments":[100],"showing":[101],"outperforms":[104],"8":[105],"state-of-the-art":[106],"blockers.":[107],"Finally,":[108],"provide":[110],"an":[111],"in-depth":[112],"analysis":[113],"of":[114],"Sparkly's":[115],"performance,":[116],"regarding":[117],"both":[118],"recall/output":[119],"size":[120],"runtime.":[122],"Our":[123],"findings":[124],"suggest":[125],"(a)":[127],"needs":[130],"more":[131],"attention,":[132],"(b)":[133],"forms":[135],"strong":[137],"baseline":[138],"future":[140,148],"work":[142,150],"should":[143,151],"compare":[144],"against,":[145],"(c)":[147],"seriously":[152],"consider":[153],"blocking,":[155],"helps":[157,166],"improve":[158,167],"recall,":[159],"architecture,":[164],"scalability,":[168],"predictability,":[169],"extensibility.":[171]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":14},{"year":2024,"cited_by_count":10},{"year":2023,"cited_by_count":1}],"updated_date":"2026-02-25T08:12:03.925757","created_date":"2025-10-10T00:00:00"}
