{"id":"https://openalex.org/W4406458108","doi":"https://doi.org/10.1109/bigdata62323.2024.10825041","title":"The Soundex Blocking: A Novel Blocking Approach for Record Linkage","display_name":"The Soundex Blocking: A Novel Blocking Approach for Record Linkage","publication_year":2024,"publication_date":"2024-12-15","ids":{"openalex":"https://openalex.org/W4406458108","doi":"https://doi.org/10.1109/bigdata62323.2024.10825041"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata62323.2024.10825041","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata62323.2024.10825041","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Big Data (BigData)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102561590","display_name":"Nidhibahen Shah","orcid":null},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Nidhibahen Shah","raw_affiliation_strings":["University of Connecticut,USA"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,USA","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079036569","display_name":"Ahmed Soliman","orcid":"https://orcid.org/0000-0002-4339-1385"},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ahmed Soliman","raw_affiliation_strings":["University of Connecticut,USA"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,USA","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104232356","display_name":"Joyanta Basak","orcid":null},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joyanta Basak","raw_affiliation_strings":["University of Connecticut,USA"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,USA","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070824596","display_name":"Sartaj Sahni","orcid":"https://orcid.org/0000-0002-8129-1676"},"institutions":[{"id":"https://openalex.org/I33213144","display_name":"University of Florida","ror":"https://ror.org/02y3ad647","country_code":"US","type":"education","lineage":["https://openalex.org/I33213144"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sartaj Sahni","raw_affiliation_strings":["University of Florida,Gainesville,Florida"],"affiliations":[{"raw_affiliation_string":"University of Florida,Gainesville,Florida","institution_ids":["https://openalex.org/I33213144"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090815218","display_name":"Kenneth B. Haase","orcid":null},"institutions":[{"id":"https://openalex.org/I1333512998","display_name":"United States Census Bureau","ror":"https://ror.org/01qn7cs15","country_code":"US","type":"funder","lineage":["https://openalex.org/I1333512998","https://openalex.org/I1343035065"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kenneth Haase","raw_affiliation_strings":["U.S. Census Bureau,Washington DC,USA"],"affiliations":[{"raw_affiliation_string":"U.S. Census Bureau,Washington DC,USA","institution_ids":["https://openalex.org/I1333512998"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015264944","display_name":"Anup Mathur","orcid":null},"institutions":[{"id":"https://openalex.org/I1333512998","display_name":"United States Census Bureau","ror":"https://ror.org/01qn7cs15","country_code":"US","type":"funder","lineage":["https://openalex.org/I1333512998","https://openalex.org/I1343035065"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anup Mathur","raw_affiliation_strings":["U.S. Census Bureau,Washington DC,USA"],"affiliations":[{"raw_affiliation_string":"U.S. Census Bureau,Washington DC,USA","institution_ids":["https://openalex.org/I1333512998"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009015372","display_name":"K. Park","orcid":null},"institutions":[{"id":"https://openalex.org/I1333512998","display_name":"United States Census Bureau","ror":"https://ror.org/01qn7cs15","country_code":"US","type":"funder","lineage":["https://openalex.org/I1333512998","https://openalex.org/I1343035065"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Krista Park","raw_affiliation_strings":["U.S. Census Bureau,Washington DC,USA"],"affiliations":[{"raw_affiliation_string":"U.S. Census Bureau,Washington DC,USA","institution_ids":["https://openalex.org/I1333512998"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103083275","display_name":"Daniel H. Weinberg","orcid":"https://orcid.org/0000-0003-1873-8380"},"institutions":[{"id":"https://openalex.org/I1333512998","display_name":"United States Census Bureau","ror":"https://ror.org/01qn7cs15","country_code":"US","type":"funder","lineage":["https://openalex.org/I1333512998","https://openalex.org/I1343035065"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel Weinberg","raw_affiliation_strings":["U.S. Census Bureau,Washington DC,USA"],"affiliations":[{"raw_affiliation_string":"U.S. Census Bureau,Washington DC,USA","institution_ids":["https://openalex.org/I1333512998"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114253641","display_name":"Jonathan White","orcid":null},"institutions":[{"id":"https://openalex.org/I1333512998","display_name":"United States Census Bureau","ror":"https://ror.org/01qn7cs15","country_code":"US","type":"funder","lineage":["https://openalex.org/I1333512998","https://openalex.org/I1343035065"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonathan White","raw_affiliation_strings":["U.S. Census Bureau,Washington DC,USA"],"affiliations":[{"raw_affiliation_string":"U.S. Census Bureau,Washington DC,USA","institution_ids":["https://openalex.org/I1333512998"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034177039","display_name":"Sanguthevar Rajasekaran","orcid":null},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sanguthevar Rajasekaran","raw_affiliation_strings":["University of Connecticut,USA"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,USA","institution_ids":["https://openalex.org/I140172145"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5102561590"],"corresponding_institution_ids":["https://openalex.org/I140172145"],"apc_list":null,"apc_paid":null,"fwci":1.0297,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.81403947,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"4039","last_page":"4047"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11614","display_name":"Cloud Data Security Solutions","score":0.9736999869346619,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10927","display_name":"Access Control and Trust","score":0.9337000250816345,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/blocking","display_name":"Blocking (statistics)","score":0.9048625230789185},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5861416459083557},{"id":"https://openalex.org/keywords/linkage","display_name":"Linkage (software)","score":0.42897775769233704},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.2788892984390259},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.06806662678718567},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.05006355047225952}],"concepts":[{"id":"https://openalex.org/C144745244","wikidata":"https://www.wikidata.org/wiki/Q4927286","display_name":"Blocking (statistics)","level":2,"score":0.9048625230789185},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5861416459083557},{"id":"https://openalex.org/C31266012","wikidata":"https://www.wikidata.org/wiki/Q6554340","display_name":"Linkage (software)","level":3,"score":0.42897775769233704},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2788892984390259},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.06806662678718567},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.05006355047225952},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata62323.2024.10825041","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata62323.2024.10825041","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Big Data (BigData)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320308380","display_name":"Yale University","ror":"https://ror.org/03v76x132"},{"id":"https://openalex.org/F4320331525","display_name":"U.S. Census Bureau","ror":"https://ror.org/01qn7cs15"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W2008377649","https://openalex.org/W2053870252","https://openalex.org/W2063678733","https://openalex.org/W2127787701","https://openalex.org/W2140926830","https://openalex.org/W2223011434","https://openalex.org/W2343236305","https://openalex.org/W2805602976","https://openalex.org/W2908287046","https://openalex.org/W3138971549","https://openalex.org/W4229452932","https://openalex.org/W4230502578","https://openalex.org/W4391094703","https://openalex.org/W6639953478","https://openalex.org/W6718026493"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2392835431","https://openalex.org/W2126932387","https://openalex.org/W1965371215","https://openalex.org/W2353762239","https://openalex.org/W66314852","https://openalex.org/W2185938410","https://openalex.org/W2484966135"],"abstract_inverted_index":{"The":[0,199],"problem":[1],"of":[2,37,54,107,144],"record":[3,85,186],"linkage":[4,30,86,180,187,197],"is":[5,65,77,97,104],"to":[6,23,67,72,82,134,183,204],"cluster":[7,17],"the":[8,20,50,142,154,184],"records":[9,21],"from":[10,59],"several":[11],"data":[12],"sources":[13],"such":[14],"that":[15,96,164],"each":[16],"has":[18,31,119,150],"all":[19],"belonging":[22],"one":[24,27,148],"and":[25,47,172,195],"only":[26,123],"entity.":[28],"Record":[29],"applications":[32],"in":[33,153],"a":[34,78,93,105,125,138],"wide":[35],"variety":[36],"domains":[38],"including":[39],"public":[40],"health,":[41],"law":[42],"enforcement,":[43],"fraud":[44],"detection,":[45],"biology,":[46],"transportation.":[48],"Given":[49],"typically":[51],"vast":[52],"sizes":[53],"datasets,":[55],"existing":[56],"algorithms":[57,70],"suffer":[58],"very":[60],"long":[61],"runtimes.":[62],"Hence,":[63],"it":[64,152],"essential":[66],"develop":[68],"novel":[69,161],"tailored":[71],"address":[73],"this":[74,89,129],"issue.":[75],"Blocking":[76],"popular":[79],"technique":[80,95],"employed":[81],"speed":[83],"up":[84],"algorithms.":[87],"In":[88,128,156],"paper,":[90,130],"we":[91,131,158],"employ":[92],"blocking":[94,139,162,171,201],"based":[98,110],"on":[99,111],"Soundex":[100,102,118,136,166,170,175],"encoding.":[101],"index":[103],"method":[106],"coding":[108],"names":[109],"their":[112,116],"pronunciation":[113],"rather":[114],"than":[115],"spelling.":[117],"been":[120],"traditionally":[121],"used":[122],"as":[124,137,189],"distance":[126],"metric.":[127],"show":[132],"how":[133],"use":[135],"technique.":[140],"To":[141],"best":[143],"our":[145],"knowledge,":[146],"no":[147],"else":[149],"done":[151],"past.":[155],"fact,":[157],"introduce":[159],"two":[160],"approaches":[163,177,202],"utilize":[165],"encoding:":[167],"One":[168],"stage":[169,174],"Two":[173],"blocking.Our":[176],"exhibit":[178],"superior":[179],"performance":[181],"compared":[182],"state-of-the-art":[185],"algorithms,":[188],"evidenced":[190],"by":[191],"higher":[192],"F-1":[193],"scores":[194],"reduced":[196],"times.":[198],"proposed":[200],"prove":[203],"be":[205],"highly":[206],"effective.":[207]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
