{"id":"https://openalex.org/W3014705052","doi":"https://doi.org/10.14778/3421424.3421431","title":"Deep entity matching with pre-trained language models","display_name":"Deep entity matching with pre-trained language models","publication_year":2020,"publication_date":"2020-09-01","ids":{"openalex":"https://openalex.org/W3014705052","doi":"https://doi.org/10.14778/3421424.3421431","mag":"3014705052"},"language":"en","primary_location":{"id":"doi:10.14778/3421424.3421431","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3421424.3421431","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2004.00584","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yuliang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuliang Li","raw_affiliation_strings":["Megagon Labs"],"affiliations":[{"raw_affiliation_string":"Megagon Labs","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinfeng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinfeng Li","raw_affiliation_strings":["Megagon Labs"],"affiliations":[{"raw_affiliation_string":"Megagon Labs","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yoshihiko Suhara","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yoshihiko Suhara","raw_affiliation_strings":["Megagon Labs"],"affiliations":[{"raw_affiliation_string":"Megagon Labs","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"AnHai Doan","orcid":null},"institutions":[{"id":"https://openalex.org/I135310074","display_name":"University of Wisconsin\u2013Madison","ror":"https://ror.org/01y2jtd41","country_code":"US","type":"education","lineage":["https://openalex.org/I135310074"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"AnHai Doan","raw_affiliation_strings":["University of Wisconsin Madison"],"affiliations":[{"raw_affiliation_string":"University of Wisconsin Madison","institution_ids":["https://openalex.org/I135310074"]}]},{"author_position":"last","author":{"id":null,"display_name":"Wang-Chiew Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang-Chiew Tan","raw_affiliation_strings":["Megagon Labs"],"affiliations":[{"raw_affiliation_string":"Megagon Labs","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":23.2183,"has_fulltext":false,"cited_by_count":277,"citation_normalized_percentile":{"value":0.99682194,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":"14","issue":"1","first_page":"50","last_page":"60"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9262999892234802,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9262999892234802,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.033399999141693115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.00989999994635582,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.8169000148773193},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.7003999948501587},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6710000038146973},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5216000080108643},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.396699994802475},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.384799987077713},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.3391999900341034},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3361000120639801}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.8169000148773193},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7328000068664551},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.7003999948501587},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6710000038146973},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6101999878883362},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5216000080108643},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42660000920295715},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42080000042915344},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.396699994802475},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.384799987077713},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.3391999900341034},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2996000051498413},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.29750001430511475},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2782000005245209},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25850000977516174},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.25589999556541443},{"id":"https://openalex.org/C123853557","wikidata":"https://www.wikidata.org/wiki/Q7098946","display_name":"Optimal matching","level":3,"score":0.2551000118255615},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.14778/3421424.3421431","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3421424.3421431","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2004.00584","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2004.00584","pdf_url":"https://arxiv.org/pdf/2004.00584","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2004.00584","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2004.00584","pdf_url":"https://arxiv.org/pdf/2004.00584","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W1976437052","https://openalex.org/W1981590391","https://openalex.org/W2014327223","https://openalex.org/W2031250218","https://openalex.org/W2056748234","https://openalex.org/W2064675550","https://openalex.org/W2067566391","https://openalex.org/W2107966677","https://openalex.org/W2114764731","https://openalex.org/W2139077857","https://openalex.org/W2145492473","https://openalex.org/W2154785834","https://openalex.org/W2164456230","https://openalex.org/W2182703380","https://openalex.org/W2250539671","https://openalex.org/W2493916176","https://openalex.org/W2542998387","https://openalex.org/W2741075451","https://openalex.org/W2767681556","https://openalex.org/W2775696413","https://openalex.org/W2798416089","https://openalex.org/W2798649495","https://openalex.org/W2886950694","https://openalex.org/W2911489562","https://openalex.org/W2945623882","https://openalex.org/W2945883855","https://openalex.org/W2946417913","https://openalex.org/W2946504770","https://openalex.org/W2951147191","https://openalex.org/W2957204582","https://openalex.org/W2962965405","https://openalex.org/W2970641574","https://openalex.org/W2971296908","https://openalex.org/W2979826702","https://openalex.org/W2984651502","https://openalex.org/W2997591727","https://openalex.org/W3004437239","https://openalex.org/W3013103751","https://openalex.org/W3014705052"],"related_works":[],"abstract_inverted_index":{"We":[0,14,74],"present":[1],"Ditto,":[2],"a":[3,20,29,36,132,206,224],"novel":[4],"entity":[5],"matching":[6,57,84,107,162,212],"system":[7],"based":[8],"on":[9,49,71,135,205],"pre-trained":[10,48],"Transformer-based":[11],"language":[12,40],"models.":[13],"fine-tune":[15],"and":[16,59,125,219],"cast":[17],"EM":[18,141,209],"as":[19,43],"sequence-pair":[21],"classification":[22],"problem":[23],"to":[24,66,80,90,140,142,155,158,176],"leverage":[25],"such":[26,42],"models":[27,41],"with":[28,147,191],"simple":[30],"architecture.":[31],"Our":[32],"experiments":[33],"show":[34],"that":[35,100,113,118,183],"straight-forward":[37],"application":[38],"of":[39,68,97,103,172,197,217,228],"BERT,":[44],"DistilBERT,":[45],"or":[46],"RoBERTa":[47],"large":[50],"text":[51,139],"corpora":[52],"already":[53],"significantly":[54],"improves":[55],"the":[56,120,144,160,170,187,195],"quality":[58],"outperforms":[60],"previous":[61,188],"state-of-the-art":[62],"(SOTA),":[63],"by":[64,93,174],"up":[65,175],"29%":[67],"F1":[69,226],"score":[70,227],"benchmark":[72],"datasets.":[73],"also":[75,110],"developed":[76,167],"three":[77],"optimization":[78],"techniques":[79],"further":[81,168],"improve":[82,159],"Ditto's":[83,203],"capability.":[85,163],"Ditto":[86,109,130,152,173,184,222],"allows":[87],"domain":[88],"knowledge":[89],"be":[91,102],"injected":[92],"highlighting":[94],"important":[95],"pieces":[96],"input":[98],"information":[99,122],"may":[101],"interest":[104],"when":[105],"making":[106],"decisions.":[108],"summarizes":[111],"strings":[112],"are":[114],"too":[115],"long":[116],"so":[117],"only":[119],"essential":[121],"is":[123,153],"retained":[124],"used":[126],"for":[127,138],"EM.":[128],"Finally,":[129,200],"adapts":[131],"SOTA":[133,189],"technique":[134],"data":[136,146],"augmentation":[137],"augment":[143],"training":[145],"(difficult)":[148],"examples.":[149],"This":[150],"way,":[151],"forced":[154],"learn":[156],"\"harder\"":[157],"model's":[161],"The":[164],"optimizations":[165],"we":[166,181,201],"boost":[169],"performance":[171],"9.8%.":[177],"Perhaps":[178],"more":[179],"surprisingly,":[180],"establish":[182],"can":[185],"achieve":[186],"results":[190],"at":[192],"most":[193],"half":[194],"number":[196],"labeled":[198],"data.":[199],"demonstrate":[202],"effectiveness":[204],"real-world":[207],"large-scale":[208],"task.":[210],"On":[211],"two":[213],"company":[214],"datasets":[215],"consisting":[216],"789K":[218],"412K":[220],"records,":[221],"achieves":[223],"high":[225],"96.5%.":[229]},"counts_by_year":[{"year":2026,"cited_by_count":16},{"year":2025,"cited_by_count":70},{"year":2024,"cited_by_count":66},{"year":2023,"cited_by_count":52},{"year":2022,"cited_by_count":39},{"year":2021,"cited_by_count":31},{"year":2020,"cited_by_count":3}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2020-04-10T00:00:00"}
