{"id":"https://openalex.org/W3013103751","doi":"https://doi.org/10.1145/3318464.3380597","title":"A Comprehensive Benchmark Framework for Active Learning Methods in Entity Matching","display_name":"A Comprehensive Benchmark Framework for Active Learning Methods in Entity Matching","publication_year":2020,"publication_date":"2020-05-29","ids":{"openalex":"https://openalex.org/W3013103751","doi":"https://doi.org/10.1145/3318464.3380597","mag":"3013103751"},"language":"en","primary_location":{"id":"doi:10.1145/3318464.3380597","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3318464.3380597","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2003.13114","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Venkata Vamsikrishna Meduri","orcid":null},"institutions":[{"id":"https://openalex.org/I55732556","display_name":"Arizona State University","ror":"https://ror.org/03efmqc40","country_code":"US","type":"education","lineage":["https://openalex.org/I55732556"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Venkata Vamsikrishna Meduri","raw_affiliation_strings":["Arizona State University, Tempe, AZ, USA"],"affiliations":[{"raw_affiliation_string":"Arizona State University, Tempe, AZ, USA","institution_ids":["https://openalex.org/I55732556"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Lucian Popa","orcid":null},"institutions":[{"id":"https://openalex.org/I4210085935","display_name":"IBM Research - Almaden","ror":"https://ror.org/005w8dd04","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210085935","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lucian Popa","raw_affiliation_strings":["IBM Research, Almaden, San Jose, CA, USA"],"affiliations":[{"raw_affiliation_string":"IBM Research, Almaden, San Jose, CA, USA","institution_ids":["https://openalex.org/I4210085935"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Prithviraj Sen","orcid":null},"institutions":[{"id":"https://openalex.org/I4210085935","display_name":"IBM Research - Almaden","ror":"https://ror.org/005w8dd04","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210085935","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Prithviraj Sen","raw_affiliation_strings":["IBM Research, Almaden, San Jose, CA, USA"],"affiliations":[{"raw_affiliation_string":"IBM Research, Almaden, San Jose, CA, USA","institution_ids":["https://openalex.org/I4210085935"]}]},{"author_position":"last","author":{"id":null,"display_name":"Mohamed Sarwat","orcid":null},"institutions":[{"id":"https://openalex.org/I55732556","display_name":"Arizona State University","ror":"https://ror.org/03efmqc40","country_code":"US","type":"education","lineage":["https://openalex.org/I55732556"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mohamed Sarwat","raw_affiliation_strings":["Arizona State University, Tempe, AZ, USA"],"affiliations":[{"raw_affiliation_string":"Arizona State University, Tempe, AZ, USA","institution_ids":["https://openalex.org/I55732556"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I55732556"],"apc_list":null,"apc_paid":null,"fwci":4.526,"has_fulltext":false,"cited_by_count":47,"citation_normalized_percentile":{"value":0.95436085,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1133","last_page":"1147"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/active-learning","display_name":"Active learning (machine learning)","score":0.7240999937057495},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.5703999996185303},{"id":"https://openalex.org/keywords/semi-supervised-learning","display_name":"Semi-supervised learning","score":0.5626999735832214},{"id":"https://openalex.org/keywords/oracle","display_name":"Oracle","score":0.5393000245094299},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5378000140190125},{"id":"https://openalex.org/keywords/supervised-learning","display_name":"Supervised learning","score":0.486299991607666},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.47920000553131104},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.43790000677108765}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7813000082969666},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.7240999937057495},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.7073000073432922},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6341000199317932},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.5703999996185303},{"id":"https://openalex.org/C58973888","wikidata":"https://www.wikidata.org/wiki/Q1041418","display_name":"Semi-supervised learning","level":2,"score":0.5626999735832214},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.5393000245094299},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5378000140190125},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.486299991607666},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.47920000553131104},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.43790000677108765},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.4309999942779541},{"id":"https://openalex.org/C24138899","wikidata":"https://www.wikidata.org/wiki/Q17141258","display_name":"Instance-based learning","level":3,"score":0.40959998965263367},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.36959999799728394},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.329800009727478},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.30869999527931213},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C199190896","wikidata":"https://www.wikidata.org/wiki/Q3509276","display_name":"Learning classifier system","level":3,"score":0.2694999873638153},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.25699999928474426},{"id":"https://openalex.org/C90673727","wikidata":"https://www.wikidata.org/wiki/Q901718","display_name":"Product (mathematics)","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3318464.3380597","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3318464.3380597","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2003.13114","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2003.13114","pdf_url":"https://arxiv.org/pdf/2003.13114","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2003.13114","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2003.13114","pdf_url":"https://arxiv.org/pdf/2003.13114","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1528361845","https://openalex.org/W1968200975","https://openalex.org/W1981590391","https://openalex.org/W2052698082","https://openalex.org/W2053653724","https://openalex.org/W2067566391","https://openalex.org/W2081118310","https://openalex.org/W2106675345","https://openalex.org/W2107966677","https://openalex.org/W2119320829","https://openalex.org/W2163051926","https://openalex.org/W2167595980","https://openalex.org/W2170902582","https://openalex.org/W2428834396","https://openalex.org/W2535778619","https://openalex.org/W2542998387","https://openalex.org/W2612526608","https://openalex.org/W2612964472","https://openalex.org/W2767681556","https://openalex.org/W2775696413","https://openalex.org/W2798649495","https://openalex.org/W3012207844","https://openalex.org/W3106889297","https://openalex.org/W6664916307","https://openalex.org/W6696781119"],"related_works":[],"abstract_inverted_index":{"Entity":[0],"Matching":[1],"(EM)":[2],"is":[3,21,86,170],"a":[4,59,126,151],"core":[5],"data":[6,31],"cleaning":[7],"task,":[8],"aiming":[9],"to":[10,24,40,70,87,94,120,210],"identify":[11],"different":[12,73],"mentions":[13],"of":[14,28,83,128,153,163,192,201,216],"the":[15,26,37,48,84,164,177,190,193,214,217],"same":[16],"real-world":[17],"entity.":[18],"Active":[19],"learning":[20,62,74,97,123,145,173,180],"one":[22],"way":[23],"address":[25],"challenge":[27],"scarce":[29],"labeled":[30,42],"in":[32,199],"practice,":[33],"by":[34,43,196,208],"dynamically":[35],"collecting":[36],"necessary":[38],"examples":[39],"be":[41],"an":[44,171],"Oracle":[45],"and":[46,117,134,203],"refining":[47],"learned":[49,194],"model":[50,195],"(classifier)":[51],"upon":[52],"them.":[53],"In":[54,159],"this":[55],"paper,":[56],"we":[57,106,166],"build":[58],"unified":[60],"active":[61,96,122,144,172],"benchmark":[63],"framework":[64,85,183],"for":[65,91,102,161],"EM":[66,113,131],"that":[67,143,168,175,188],"allows":[68],"users":[69],"easily":[71],"combine":[72],"algorithms":[75],"with":[76,146],"applicable":[77],"example":[78,135,205],"selection":[79,136,206],"algorithms.":[80],"The":[81],"goal":[82],"enable":[88],"concrete":[89],"guidelines":[90],"practitioners":[92],"as":[93,156],"what":[95],"combinations":[98],"will":[99],"work":[100],"well":[101],"EM.":[103],"Towards":[104],"this,":[105],"perform":[107],"comprehensive":[108],"experiments":[109],"on":[110],"publicly":[111],"available":[112],"datasets":[114],"from":[115],"product":[116],"publication":[118],"domains":[119],"evaluate":[121],"methods,":[124],"using":[125],"variety":[127],"metrics":[129],"including":[130],"quality,":[132],"#labels":[133],"latencies.":[137],"Our":[138,182],"most":[139],"surprising":[140],"result":[141],"finds":[142],"fewer":[147],"labels":[148],"can":[149],"learn":[150],"classifier":[152],"comparable":[154],"quality":[155,191,215],"supervised":[157,179],"learning.":[158],"fact,":[160],"several":[162],"datasets,":[165],"show":[167],"there":[169],"combination":[174],"beats":[176],"state-of-the-art":[178],"result.":[181],"also":[184],"includes":[185],"novel":[186],"optimizations":[187],"improve":[189],"roughly":[197],"9%":[198],"terms":[200],"F1-score":[202],"reduce":[204],"latencies":[207],"up":[209],"10\u00d7":[211],"without":[212],"affecting":[213],"model.":[218]},"counts_by_year":[{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":9},{"year":2022,"cited_by_count":8},{"year":2021,"cited_by_count":11},{"year":2020,"cited_by_count":5}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2020-04-03T00:00:00"}
