{"id":"https://openalex.org/W2963784432","doi":"https://doi.org/10.1109/bigdata47090.2019.9006095","title":"Fast Record Linkage for Company Entities","display_name":"Fast Record Linkage for Company Entities","publication_year":2019,"publication_date":"2019-12-01","ids":{"openalex":"https://openalex.org/W2963784432","doi":"https://doi.org/10.1109/bigdata47090.2019.9006095","mag":"2963784432"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata47090.2019.9006095","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1907.08667","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076293356","display_name":"Thomas Gschwind","orcid":"https://orcid.org/0000-0003-0212-4800"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Thomas Gschwind","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012998759","display_name":"Christoph Miksovic","orcid":"https://orcid.org/0000-0002-7846-3668"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Christoph Miksovic","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040326345","display_name":"Julian Minder","orcid":null},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Julian Minder","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082535778","display_name":"\u041a\u0430\u0446\u044f\u0440\u044b\u043d\u0430 \u041c\u0438\u0440\u044b\u043b\u0435\u043d\u043a\u0430","orcid":"https://orcid.org/0000-0002-1614-6835"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Katsiaryna Mirylenka","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101547078","display_name":"Paolo Scotton","orcid":"https://orcid.org/0000-0003-4737-0108"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Paolo Scotton","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.6175,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.73135198,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"623","last_page":"630"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9539999961853027,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9416000247001648,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7470270395278931},{"id":"https://openalex.org/keywords/record-linkage","display_name":"Record linkage","score":0.704459011554718},{"id":"https://openalex.org/keywords/linkage","display_name":"Linkage (software)","score":0.6971012949943542},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.614661455154419},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5630542039871216},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5618675947189331},{"id":"https://openalex.org/keywords/analytics","display_name":"Analytics","score":0.5542473196983337},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5113279223442078},{"id":"https://openalex.org/keywords/master-data","display_name":"Master data","score":0.4999868869781494},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.49794483184814453},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.49543702602386475},{"id":"https://openalex.org/keywords/data-integration","display_name":"Data integration","score":0.4270249605178833},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.41948214173316956},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4135342836380005},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3792048394680023},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.2318369746208191}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7470270395278931},{"id":"https://openalex.org/C142210648","wikidata":"https://www.wikidata.org/wiki/Q1266546","display_name":"Record linkage","level":3,"score":0.704459011554718},{"id":"https://openalex.org/C31266012","wikidata":"https://www.wikidata.org/wiki/Q6554340","display_name":"Linkage (software)","level":3,"score":0.6971012949943542},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.614661455154419},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5630542039871216},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5618675947189331},{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.5542473196983337},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5113279223442078},{"id":"https://openalex.org/C61871575","wikidata":"https://www.wikidata.org/wiki/Q384093","display_name":"Master data","level":2,"score":0.4999868869781494},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.49794483184814453},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.49543702602386475},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.4270249605178833},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.41948214173316956},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4135342836380005},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3792048394680023},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.2318369746208191},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C149923435","wikidata":"https://www.wikidata.org/wiki/Q37732","display_name":"Demography","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/bigdata47090.2019.9006095","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1907.08667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1907.08667","pdf_url":"https://arxiv.org/pdf/1907.08667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:2963784432","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1907.08667","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1907.08667","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1907.08667","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1907.08667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1907.08667","pdf_url":"https://arxiv.org/pdf/1907.08667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6200000047683716,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2963784432.pdf","grobid_xml":"https://content.openalex.org/works/W2963784432.grobid-xml"},"referenced_works_count":39,"referenced_works":["https://openalex.org/W36024057","https://openalex.org/W102708294","https://openalex.org/W1550430455","https://openalex.org/W1736726159","https://openalex.org/W1940872118","https://openalex.org/W1991800036","https://openalex.org/W1992346944","https://openalex.org/W2012833704","https://openalex.org/W2041439319","https://openalex.org/W2073471108","https://openalex.org/W2097263486","https://openalex.org/W2127050919","https://openalex.org/W2132069633","https://openalex.org/W2143996849","https://openalex.org/W2147880316","https://openalex.org/W2397770138","https://openalex.org/W2544062944","https://openalex.org/W2546672044","https://openalex.org/W2605305417","https://openalex.org/W2702570413","https://openalex.org/W2767681556","https://openalex.org/W2798649495","https://openalex.org/W2809779196","https://openalex.org/W2925156238","https://openalex.org/W2932842219","https://openalex.org/W3013357708","https://openalex.org/W4230502578","https://openalex.org/W4230940751","https://openalex.org/W4300601563","https://openalex.org/W6601540176","https://openalex.org/W6604189946","https://openalex.org/W6640362995","https://openalex.org/W6678771216","https://openalex.org/W6679663036","https://openalex.org/W6682082992","https://openalex.org/W6735896536","https://openalex.org/W6752790590","https://openalex.org/W6761276267","https://openalex.org/W6775149449"],"related_works":["https://openalex.org/W2892179619","https://openalex.org/W2922276810","https://openalex.org/W1981578383","https://openalex.org/W1568445463","https://openalex.org/W1547612978","https://openalex.org/W2143683307","https://openalex.org/W2112263062","https://openalex.org/W2187801565","https://openalex.org/W3210445714","https://openalex.org/W3208996364","https://openalex.org/W3194927518","https://openalex.org/W2013909137","https://openalex.org/W2942536483","https://openalex.org/W3109346489","https://openalex.org/W3134521310","https://openalex.org/W3121361745","https://openalex.org/W109373635","https://openalex.org/W2593617781","https://openalex.org/W2113855145","https://openalex.org/W2186829629"],"abstract_inverted_index":{"Record":[0],"linkage":[1,80],"is":[2,25,70,97],"an":[3,101],"essential":[4],"part":[5],"of":[6,104,124,138],"nearly":[7],"all":[8],"real-world":[9,112],"systems":[10],"that":[11,77,118],"consume":[12],"structured":[13],"and":[14,43,62],"unstructured":[15],"data":[16,31,41],"coming":[17],"from":[18],"different":[19],"sources.":[20],"Typically":[21],"no":[22],"common":[23],"key":[24],"available":[26],"for":[27,91,129],"connecting":[28],"records.":[29],"Massive":[30],"integration":[32],"processes":[33],"often":[34],"have":[35],"to":[36,89,127],"be":[37,47],"completed":[38],"before":[39],"any":[40],"analytics":[42],"further":[44],"processing":[45],"can":[46],"performed.":[48],"In":[49],"this":[50],"work":[51],"we":[52,116],"focus":[53],"on":[54,111],"company":[55,59,93],"entity":[56],"matching,":[57],"where":[58],"name,":[60],"location":[61],"industry":[63],"are":[64],"taken":[65],"into":[66],"account.":[67],"Our":[68],"contribution":[69],"a":[71,85,122],"highly":[72],"scalable,":[73],"enterprise-grade":[74],"end-to-end":[75],"system":[76],"uses":[78],"rule-based":[79],"algorithms":[81],"in":[82,141],"combination":[83],"with":[84,135],"machine":[86],"learning":[87],"approach":[88,120],"account":[90],"short":[92],"names.":[94],"Linkage":[95],"time":[96],"greatly":[98],"reduced":[99],"by":[100],"efficient":[102],"decomposition":[103],"the":[105,136,142],"search":[106],"space":[107],"using":[108],"MinHash.":[109],"Based":[110],"ground":[113],"truth":[114],"datasets,":[115],"show":[117],"our":[119],"reaches":[121],"recall":[123],"91%":[125],"compared":[126],"73%":[128],"baseline":[130],"approaches,":[131],"while":[132],"scaling":[133],"linearly":[134],"number":[137],"nodes":[139],"used":[140],"system.":[143]},"counts_by_year":[{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
