{"id":"https://openalex.org/W2963784432","doi":"https://doi.org/10.1109/bigdata47090.2019.9006095","title":"Fast Record Linkage for Company Entities","display_name":"Fast Record Linkage for Company Entities","publication_year":2019,"publication_date":"2019-12-01","ids":{"openalex":"https://openalex.org/W2963784432","doi":"https://doi.org/10.1109/bigdata47090.2019.9006095","mag":"2963784432"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata47090.2019.9006095","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1907.08667","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076293356","display_name":"Thomas Gschwind","orcid":"https://orcid.org/0000-0003-0212-4800"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Thomas Gschwind","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012998759","display_name":"Christoph Miksovic","orcid":"https://orcid.org/0000-0002-7846-3668"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Christoph Miksovic","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040326345","display_name":"Julian Minder","orcid":null},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Julian Minder","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082535778","display_name":"\u041a\u0430\u0446\u044f\u0440\u044b\u043d\u0430 \u041c\u0438\u0440\u044b\u043b\u0435\u043d\u043a\u0430","orcid":"https://orcid.org/0000-0002-1614-6835"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Katsiaryna Mirylenka","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM Res.-Zurich, Ruschlikon, Switzerland"],"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM Res.-Zurich, Ruschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101547078","display_name":"Paolo Scotton","orcid":"https://orcid.org/0000-0003-4737-0108"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Paolo Scotton","raw_affiliation_strings":["IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","IBM"],"affiliations":[{"raw_affiliation_string":"IBM Research \u2014 Zurich, R\u00fcschlikon, Switzerland","institution_ids":["https://openalex.org/I4210126328"]},{"raw_affiliation_string":"IBM","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5076293356"],"corresponding_institution_ids":["https://openalex.org/I4210126328"],"apc_list":null,"apc_paid":null,"fwci":0.62879099,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.72285419,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"623","last_page":"630"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9539999961853027,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9416000247001648,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7470270395278931},{"id":"https://openalex.org/keywords/record-linkage","display_name":"Record linkage","score":0.704459011554718},{"id":"https://openalex.org/keywords/linkage","display_name":"Linkage (software)","score":0.6971012949943542},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.614661455154419},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5630542039871216},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5618675947189331},{"id":"https://openalex.org/keywords/analytics","display_name":"Analytics","score":0.5542473196983337},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5113279223442078},{"id":"https://openalex.org/keywords/master-data","display_name":"Master data","score":0.4999868869781494},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.49794483184814453},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.49543702602386475},{"id":"https://openalex.org/keywords/data-integration","display_name":"Data integration","score":0.4270249605178833},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.41948214173316956},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4135342836380005},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3792048394680023},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.2318369746208191}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7470270395278931},{"id":"https://openalex.org/C142210648","wikidata":"https://www.wikidata.org/wiki/Q1266546","display_name":"Record linkage","level":3,"score":0.704459011554718},{"id":"https://openalex.org/C31266012","wikidata":"https://www.wikidata.org/wiki/Q6554340","display_name":"Linkage (software)","level":3,"score":0.6971012949943542},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.614661455154419},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5630542039871216},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5618675947189331},{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.5542473196983337},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5113279223442078},{"id":"https://openalex.org/C61871575","wikidata":"https://www.wikidata.org/wiki/Q384093","display_name":"Master data","level":2,"score":0.4999868869781494},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.49794483184814453},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.49543702602386475},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.4270249605178833},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.41948214173316956},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4135342836380005},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3792048394680023},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.2318369746208191},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C149923435","wikidata":"https://www.wikidata.org/wiki/Q37732","display_name":"Demography","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/bigdata47090.2019.9006095","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1907.08667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1907.08667","pdf_url":"https://arxiv.org/pdf/1907.08667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:2963784432","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1907.08667","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1907.08667","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1907.08667","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1907.08667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1907.08667","pdf_url":"https://arxiv.org/pdf/1907.08667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.6200000047683716}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2963784432.pdf","grobid_xml":"https://content.openalex.org/works/W2963784432.grobid-xml"},"referenced_works_count":39,"referenced_works":["https://openalex.org/W36024057","https://openalex.org/W102708294","https://openalex.org/W1550430455","https://openalex.org/W1736726159","https://openalex.org/W1940872118","https://openalex.org/W1991800036","https://openalex.org/W1992346944","https://openalex.org/W2012833704","https://openalex.org/W2041439319","https://openalex.org/W2073471108","https://openalex.org/W2097263486","https://openalex.org/W2127050919","https://openalex.org/W2132069633","https://openalex.org/W2143996849","https://openalex.org/W2147880316","https://openalex.org/W2397770138","https://openalex.org/W2544062944","https://openalex.org/W2546672044","https://openalex.org/W2605305417","https://openalex.org/W2702570413","https://openalex.org/W2767681556","https://openalex.org/W2798649495","https://openalex.org/W2809779196","https://openalex.org/W2925156238","https://openalex.org/W2932842219","https://openalex.org/W3013357708","https://openalex.org/W4230502578","https://openalex.org/W4230940751","https://openalex.org/W4300601563","https://openalex.org/W6601540176","https://openalex.org/W6604189946","https://openalex.org/W6640362995","https://openalex.org/W6678771216","https://openalex.org/W6679663036","https://openalex.org/W6682082992","https://openalex.org/W6735896536","https://openalex.org/W6752790590","https://openalex.org/W6761276267","https://openalex.org/W6775149449"],"related_works":["https://openalex.org/W2892179619","https://openalex.org/W2922276810","https://openalex.org/W1981578383","https://openalex.org/W1568445463","https://openalex.org/W1547612978","https://openalex.org/W2143683307","https://openalex.org/W2112263062","https://openalex.org/W2187801565","https://openalex.org/W3210445714","https://openalex.org/W3208996364","https://openalex.org/W3194927518","https://openalex.org/W2013909137","https://openalex.org/W2942536483","https://openalex.org/W3109346489","https://openalex.org/W3134521310","https://openalex.org/W3121361745","https://openalex.org/W109373635","https://openalex.org/W2593617781","https://openalex.org/W2113855145","https://openalex.org/W2186829629"],"abstract_inverted_index":{"Record":[0],"linkage":[1,54,118,147],"is":[2,25,55,108,134,149],"an":[3,109],"essential":[4],"part":[5],"of":[6,140,157,175,193],"nearly":[7],"all":[8],"real-world":[9,163],"systems":[10],"that":[11,115,169],"consume":[12],"structured":[13],"and":[14,33,46,83,100],"unstructured":[15],"data":[16,31,34,44,71,82,85],"coming":[17],"from":[18],"different":[19],"sources.":[20],"Typically":[21],"no":[22],"common":[23],"key":[24],"available":[26],"for":[27,128,180],"connecting":[28],"records.":[29],"Massive":[30],"cleaning":[32],"integration":[35],"processes":[36],"often":[37],"have":[38],"to":[39,80,126,178],"be":[40,50],"completed":[41],"before":[42],"any":[43],"analytics":[45],"further":[47,77],"processing":[48],"can":[49],"performed.":[51],"Although":[52],"record":[53],"frequently":[56],"regarded":[57],"as":[58],"a":[59,122,173],"somewhat":[60],"tedious":[61],"but":[62],"necessary":[63],"step,":[64],"it":[65],"reveals":[66],"valuable":[67],"insights":[68,75],"into":[69,104],"the":[70,81,141,152,158,191,197],"at":[72],"hand.":[73],"These":[74,183],"guide":[76],"analytic":[78],"approaches":[79],"support":[84],"visualization.":[86],"In":[87],"this":[88],"work":[89],"we":[90,167],"focus":[91],"on":[92,162],"company":[93,97,130],"entity":[94],"matching,":[95],"where":[96],"name,":[98],"location":[99],"industry":[101],"are":[102,185],"taken":[103],"account.":[105],"Our":[106],"contribution":[107],"end-to-end,":[110],"highly":[111],"scalable,":[112],"enterprise-grade":[113],"system":[114],"uses":[116],"rule-based":[117],"algorithms":[119],"extended":[120],"with":[121,190],"machine":[123],"learning":[124],"approach":[125,171],"account":[127],"short":[129],"names.":[131],"Linkage":[132],"time":[133],"greatly":[135],"reduced":[136],"by":[137,151],"efficient":[138],"decomposition":[139],"search":[142],"space":[143],"using":[144],"MinHash.":[145],"High":[146],"accuracy":[148],"achieved":[150,186],"proposed":[153],"thorough":[154],"scoring":[155],"process":[156],"matching":[159],"candidates.":[160],"Based":[161],"ground":[164],"truth":[165],"datasets,":[166],"show":[168],"our":[170],"reaches":[172],"recall":[174],"91%":[176],"compared":[177],"73%":[179],"baseline":[181],"approaches.":[182],"results":[184],"while":[187],"scaling":[188],"linearly":[189],"number":[192],"nodes":[194],"used":[195],"in":[196],"system.":[198]},"counts_by_year":[{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":1}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
