{"id":"https://openalex.org/W2970491965","doi":"https://doi.org/10.1145/3328519.3329127","title":"Effective and Efficient Data Cleaning for Entity Matching","display_name":"Effective and Efficient Data Cleaning for Entity Matching","publication_year":2019,"publication_date":"2019-07-05","ids":{"openalex":"https://openalex.org/W2970491965","doi":"https://doi.org/10.1145/3328519.3329127","mag":"2970491965"},"language":"en","primary_location":{"id":"doi:10.1145/3328519.3329127","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3328519.3329127","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Workshop on Human-In-the-Loop Data Analytics","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053328650","display_name":"Jing Ao","orcid":"https://orcid.org/0009-0000-4104-1368"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jing Ao","raw_affiliation_strings":["North Carolina State University"],"affiliations":[{"raw_affiliation_string":"North Carolina State University","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078910758","display_name":"Rada Chirkova","orcid":"https://orcid.org/0000-0003-4249-9690"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rada Chirkova","raw_affiliation_strings":["North Carolina State University"],"affiliations":[{"raw_affiliation_string":"North Carolina State University","institution_ids":["https://openalex.org/I137902535"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5053328650"],"corresponding_institution_ids":["https://openalex.org/I137902535"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.12008886,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9779000282287598,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11819","display_name":"Data-Driven Disease Surveillance","score":0.9445000290870667,"subfield":{"id":"https://openalex.org/subfields/2713","display_name":"Epidemiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/tuple","display_name":"Tuple","score":0.8021588325500488},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7987104654312134},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.6512168645858765},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.6085981130599976},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5910014510154724},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5747547149658203},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5716157555580139},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5557003617286682},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.5379932522773743},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.5233542323112488},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.4986073970794678},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.4721051752567291},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4619671702384949},{"id":"https://openalex.org/keywords/human-in-the-loop","display_name":"Human-in-the-loop","score":0.45885398983955383},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4478335678577423},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.4219597578048706},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2900185286998749},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10317721962928772}],"concepts":[{"id":"https://openalex.org/C118930307","wikidata":"https://www.wikidata.org/wiki/Q600590","display_name":"Tuple","level":2,"score":0.8021588325500488},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7987104654312134},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.6512168645858765},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6085981130599976},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5910014510154724},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5747547149658203},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5716157555580139},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5557003617286682},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.5379932522773743},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.5233542323112488},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.4986073970794678},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.4721051752567291},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4619671702384949},{"id":"https://openalex.org/C2780626000","wikidata":"https://www.wikidata.org/wiki/Q5936775","display_name":"Human-in-the-loop","level":2,"score":0.45885398983955383},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4478335678577423},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.4219597578048706},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2900185286998749},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10317721962928772},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C118615104","wikidata":"https://www.wikidata.org/wiki/Q121416","display_name":"Discrete mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3328519.3329127","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3328519.3329127","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Workshop on Human-In-the-Loop Data Analytics","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4699999988079071,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1547612978","https://openalex.org/W1619226191","https://openalex.org/W1975184797","https://openalex.org/W2053503869","https://openalex.org/W2063103859","https://openalex.org/W2099637074","https://openalex.org/W2101654853","https://openalex.org/W2108991785","https://openalex.org/W2139646386","https://openalex.org/W2155344811","https://openalex.org/W2156504490","https://openalex.org/W2156571267","https://openalex.org/W2165580920","https://openalex.org/W2238711864","https://openalex.org/W2421097601","https://openalex.org/W2437617937","https://openalex.org/W2481573984","https://openalex.org/W2542998387","https://openalex.org/W2548122763","https://openalex.org/W2612732335","https://openalex.org/W2750620035","https://openalex.org/W2807685335","https://openalex.org/W2809037461","https://openalex.org/W2811488946","https://openalex.org/W2896587760","https://openalex.org/W3006449229","https://openalex.org/W3097993951","https://openalex.org/W3099883947","https://openalex.org/W3146259567","https://openalex.org/W4289236186","https://openalex.org/W4293582904","https://openalex.org/W4321508312"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W2146343568","https://openalex.org/W98480971","https://openalex.org/W2150291671","https://openalex.org/W2013643406","https://openalex.org/W2027972911","https://openalex.org/W2157978810","https://openalex.org/W2597809628","https://openalex.org/W2748020237"],"abstract_inverted_index":{"As":[0],"a":[1,76,125,175],"key":[2],"data-integration":[3],"step,":[4],"entity":[5],"matching":[6],"(EM)":[7],"identifies":[8],"tuples":[9],"referring":[10],"to":[11,73,97,118,171,199],"the":[12,23,34,37,45,63,88,107,129,137,151,158,167,181,187,195,201,204],"same":[13,38],"real-world":[14],"entities":[15],"in":[16,33,79,87,105,110,180,190,197],"disparate":[17],"data":[18,49],"sources.":[19],"In":[20,57,135],"many":[21],"cases,":[22],"EM":[24,108,120,159,184],"quality":[25],"can":[26],"be":[27,55,85],"improved":[28],"by":[29,51,71,102,150],"repairing":[30],"incorrect":[31],"values":[32,81],"data;":[35],"at":[36],"time,":[39,101],"it":[40],"is":[41,115],"well":[42],"known":[43],"that":[44,114,166],"time":[46,130,178],"costs":[47],"of":[48,82,147,177,183,203],"cleaning":[50,94,106,138],"human":[52,74,99],"experts":[53,75,193],"could":[54,84],"prohibitive.":[56],"this":[58],"paper,":[59],"we":[60],"focus":[61],"on":[62,128,133],"time-consuming":[64],"human-in-the-loop":[65],"data-cleaning":[66],"problem":[67],"for":[68,174],"relational":[69],"EM,":[70],"recommending":[72],"time-efficient":[77],"order":[78,113],"which":[80,191,198],"attributes":[83,202],"cleaned":[86],"given":[89,126],"data.":[90],"Our":[91,161],"proposed":[92,168],"domain-independent":[93],"framework":[95],"aims":[96],"save":[98],"users'":[100],"guiding":[103,136],"them":[104],"inputs":[109],"an":[111],"attribute":[112],"as":[116,122],"conducive":[117],"maximizing":[119],"accuracy":[121,185],"possible":[123],"within":[124],"constraint":[127],"they":[131],"spend":[132],"cleaning.":[134],"process,":[139],"our":[140],"attribute-recommendation":[141],"methods":[142],"discover":[143],"and":[144,153],"take":[145],"advantage":[146],"information":[148],"provided":[149],"data,":[152],"also":[154],"use":[155],"feedback":[156],"from":[157],"engine.":[160],"preliminary":[162],"experimental":[163],"results":[164],"suggest":[165],"approach":[169],"leads":[170],"measurable":[172],"speedup,":[173],"variety":[176],"constraints,":[179],"improvement":[182],"over":[186],"baseline":[188],"approach,":[189],"domain":[192],"choose":[194],"sequence":[196],"clean":[200],"inputs.":[205]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
