{"id":"https://openalex.org/W2042458413","doi":"https://doi.org/10.1145/2396761.2398403","title":"An automatic blocking mechanism for large-scale de-duplication tasks","display_name":"An automatic blocking mechanism for large-scale de-duplication tasks","publication_year":2012,"publication_date":"2012-10-29","ids":{"openalex":"https://openalex.org/W2042458413","doi":"https://doi.org/10.1145/2396761.2398403","mag":"2042458413"},"language":"en","primary_location":{"id":"doi:10.1145/2396761.2398403","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2396761.2398403","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st ACM international conference on Information and knowledge management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114259175","display_name":"Anish Das Sarma","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Anish Das Sarma","raw_affiliation_strings":["Google Research, Mountain View, CA, USA","Google Research, Mountain View, CA, USA#TAB#"],"affiliations":[{"raw_affiliation_string":"Google Research, Mountain View, CA, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google Research, Mountain View, CA, USA#TAB#","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102717066","display_name":"Ankur Jain","orcid":"https://orcid.org/0000-0002-3246-7612"},"institutions":[{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ankur Jain","raw_affiliation_strings":["Yahoo! Research, Santa Clara, CA, USA","Yahoo! Research, Santa Clara , CA, USA"],"affiliations":[{"raw_affiliation_string":"Yahoo! Research, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"Yahoo! Research, Santa Clara , CA, USA","institution_ids":["https://openalex.org/I4210134091"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018314576","display_name":"Ashwin Machanavajjhala","orcid":"https://orcid.org/0000-0003-1555-7330"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ashwin Machanavajjhala","raw_affiliation_strings":["Duke University, Durham, NC, USA"],"affiliations":[{"raw_affiliation_string":"Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029571794","display_name":"Philip Bohannon","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114444","display_name":"Meta (United States)","ror":"https://ror.org/01zbnvs85","country_code":"US","type":"company","lineage":["https://openalex.org/I4210114444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Philip Bohannon","raw_affiliation_strings":["Facebook, Palo Alto, CA, USA","Facebook, Palo Alto, CA., USA#TAB#"],"affiliations":[{"raw_affiliation_string":"Facebook, Palo Alto, CA, USA","institution_ids":["https://openalex.org/I4210114444"]},{"raw_affiliation_string":"Facebook, Palo Alto, CA., USA#TAB#","institution_ids":["https://openalex.org/I4210114444"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5114259175"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":5.5983,"has_fulltext":false,"cited_by_count":40,"citation_normalized_percentile":{"value":0.95718261,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1055","last_page":"1064"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9861999750137329,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9743000268936157,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8034311532974243},{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.7604858875274658},{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.6557671427726746},{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.6013649702072144},{"id":"https://openalex.org/keywords/blocking","display_name":"Blocking (statistics)","score":0.5415390729904175},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.5347633957862854},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.47236543893814087},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3937278091907501},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.24705791473388672},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.240643709897995},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08398011326789856}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8034311532974243},{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.7604858875274658},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.6557671427726746},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.6013649702072144},{"id":"https://openalex.org/C144745244","wikidata":"https://www.wikidata.org/wiki/Q4927286","display_name":"Blocking (statistics)","level":2,"score":0.5415390729904175},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.5347633957862854},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.47236543893814087},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3937278091907501},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.24705791473388672},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.240643709897995},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08398011326789856},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2396761.2398403","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2396761.2398403","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st ACM international conference on Information and knowledge management","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W36024057","https://openalex.org/W1540269031","https://openalex.org/W1646278814","https://openalex.org/W1787332437","https://openalex.org/W2007682403","https://openalex.org/W2024770506","https://openalex.org/W2031250218","https://openalex.org/W2034190452","https://openalex.org/W2036216970","https://openalex.org/W2052390074","https://openalex.org/W2060322444","https://openalex.org/W2061601738","https://openalex.org/W2067566391","https://openalex.org/W2073471108","https://openalex.org/W2102462631","https://openalex.org/W2103561765","https://openalex.org/W2105484782","https://openalex.org/W2108991785","https://openalex.org/W2111116800","https://openalex.org/W2112912553","https://openalex.org/W2117974736","https://openalex.org/W2123561513","https://openalex.org/W2129598390","https://openalex.org/W2151310484","https://openalex.org/W2151930506","https://openalex.org/W2170902582","https://openalex.org/W2171574281","https://openalex.org/W2295151155","https://openalex.org/W2307747718","https://openalex.org/W4254788633","https://openalex.org/W4388782171","https://openalex.org/W6632038546","https://openalex.org/W6665810511","https://openalex.org/W6669040500","https://openalex.org/W6676669657"],"related_works":["https://openalex.org/W3144870715","https://openalex.org/W3142319788","https://openalex.org/W2587188779","https://openalex.org/W3132870970","https://openalex.org/W4385804830","https://openalex.org/W2943088381","https://openalex.org/W2144348063","https://openalex.org/W2074021203","https://openalex.org/W4296125805","https://openalex.org/W1982579475"],"abstract_inverted_index":{"De-duplication":[0],"-":[1,12],"identification":[2],"of":[3,27,30,39,50,76,80,110],"distinct":[4],"records":[5],"referring":[6],"to":[7,89,99,105],"the":[8,25,41,74,84,87,143],"same":[9],"real-world":[10],"entity":[11],"is":[13,131],"a":[14,37,62,102,107,127,139,152],"well-known":[15],"challenge":[16],"in":[17,101,126],"data":[18,68,82,114],"integration.":[19],"Since":[20],"very":[21],"large":[22,108],"datasets":[23],"prohibit":[24],"comparison":[26],"every":[28],"pair":[29],"records,":[31],"blocking":[32],"has":[33],"been":[34],"identified":[35,51],"as":[36,66,138],"technique":[38],"dividing":[40],"dataset":[42],"for":[43,53],"pairwise":[44],"comparisons,":[45],"thereby":[46],"trading":[47],"off":[48],"recall":[49],"duplicates":[52],"efficiency.":[54],"Traditional":[55],"de-duplication":[56,96],"tasks,":[57],"while":[58,116],"challenging,":[59],"typically":[60],"involved":[61],"fixed":[63],"schema":[64],"such":[65],"Census":[67],"or":[69],"medical":[70],"records.":[71],"However,":[72],"with":[73],"presence":[75],"large,":[77],"diverse":[78],"sets":[79],"structured":[81],"on":[83,93],"web":[85],"and":[86,113],"need":[88,98],"organize":[90],"it":[91,130],"effectively":[92],"content":[94],"portals,":[95],"systems":[97],"scale":[100],"new":[103],"dimension":[104],"handle":[106],"number":[109],"schemas,":[111],"tasks":[112],"sets,":[115],"handling":[117],"ever":[118],"larger":[119],"problem":[120,146],"sizes.":[121],"In":[122],"addition,":[123],"when":[124],"working":[125],"map-reduce":[128],"framework":[129],"important":[132],"that":[133,154],"canopy":[134,144],"formation":[135],"be":[136],"implemented":[137],"hash":[140],"function,":[141],"making":[142],"design":[145],"more":[147],"challenging.":[148],"We":[149],"present":[150],"CBLOCK,":[151],"system":[153],"addresses":[155],"these":[156],"challenges.":[157]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":4},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":5},{"year":2016,"cited_by_count":5},{"year":2015,"cited_by_count":6},{"year":2014,"cited_by_count":4},{"year":2013,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
