{"id":"https://openalex.org/W2145503758","doi":"https://doi.org/10.4304/jcp.5.12.1800-1809","title":"A Domain-Independent Data Cleaning Algorithm for Detecting Similar-Duplicates","display_name":"A Domain-Independent Data Cleaning Algorithm for Detecting Similar-Duplicates","publication_year":2010,"publication_date":"2010-12-01","ids":{"openalex":"https://openalex.org/W2145503758","doi":"https://doi.org/10.4304/jcp.5.12.1800-1809","mag":"2145503758"},"language":"en","primary_location":{"id":"doi:10.4304/jcp.5.12.1800-1809","is_oa":false,"landing_page_url":"https://doi.org/10.4304/jcp.5.12.1800-1809","pdf_url":null,"source":{"id":"https://openalex.org/S77894049","display_name":"Journal of Computers","issn_l":"1796-203X","issn":["1796-203X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318660","host_organization_name":"Academy Publisher","host_organization_lineage":["https://openalex.org/P4310318660"],"host_organization_lineage_names":["Academy Publisher"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Computers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023057399","display_name":"Kazi Shah Nawaz Ripon","orcid":"https://orcid.org/0000-0002-6551-9714"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kazi Shah Nawaz Ripon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101726125","display_name":"Md Ashiqur Rahman","orcid":"https://orcid.org/0000-0002-2933-2637"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ashiqur Rahman","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5014708079","display_name":"G. M. Atiqur Rahaman","orcid":"https://orcid.org/0000-0001-7387-6650"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"G.M. Atiqur Rahaman","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5023057399"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.391,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.70465762,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"5","issue":"12","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6786471009254456},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5975342988967896},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.4995701313018799},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.34901559352874756},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.16514837741851807}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6786471009254456},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5975342988967896},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4995701313018799},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.34901559352874756},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.16514837741851807},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.4304/jcp.5.12.1800-1809","is_oa":false,"landing_page_url":"https://doi.org/10.4304/jcp.5.12.1800-1809","pdf_url":null,"source":{"id":"https://openalex.org/S77894049","display_name":"Journal of Computers","issn_l":"1796-203X","issn":["1796-203X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318660","host_organization_name":"Academy Publisher","host_organization_lineage":["https://openalex.org/P4310318660"],"host_organization_lineage_names":["Academy Publisher"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Computers","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5099999904632568,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W25706487","https://openalex.org/W1484228408","https://openalex.org/W1513543741","https://openalex.org/W1559390933","https://openalex.org/W1569123402","https://openalex.org/W1612155886","https://openalex.org/W1647671624","https://openalex.org/W2010392031","https://openalex.org/W2024770506","https://openalex.org/W2050071106","https://openalex.org/W2065290081","https://openalex.org/W2087064593","https://openalex.org/W2101939932","https://openalex.org/W2107976925","https://openalex.org/W2108991785","https://openalex.org/W2111192396","https://openalex.org/W2131576956","https://openalex.org/W2134826720","https://openalex.org/W2259773661"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2051487156","https://openalex.org/W2073681303","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2350741829","https://openalex.org/W2130043461"],"abstract_inverted_index":{"Normal":[0],"0":[3],"MicrosoftInternetExplorer4":[18],"Data":[36],"mining":[37,100],"algorithms":[38,144],"generally":[39],"assume":[40],"that":[41,166],"data":[42,73,99],"will":[43,102],"be":[44,103],"clean":[45],"and":[46,57,63,146,190],"consistent.":[47],"However,":[48],"in":[49],"practice,":[50],"this":[51,59,121],"is":[52,68,110,158],"not":[53],"always":[54],"the":[55,61,85,91,94,98,116,132,155,181,191,194,201],"case,":[56],"for":[58,129,140,171],"reason":[60],"detection":[62,106,143,173],"elimination":[64],"of":[65,72,77,82,90,107,154,180,193],"duplicate":[66],"records":[67,79,109,117],"an":[69,164],"important":[70],"part":[71],"cleaning.":[74],"The":[75,105,178],"presence":[76],"similar-duplicate":[78,108,133,142,172],"causes":[80],"over-representation":[81],"data.":[83],"If":[84],"database":[86],"contains":[87],"different":[88],"representations":[89],"same":[92],"data,":[93],"results":[95],"obtained":[96],"from":[97],"algorithm":[101,165],"erroneous.":[104],"a":[111,125,151,175],"difficult":[112],"task,":[113],"especially":[114],"when":[115],"are":[118],"domain-independent.":[119],"In":[120,149],"paper,":[122],"we":[123,162],"propose":[124,163],"novel":[126],"domain-independent":[127,176],"technique":[128],"better":[130],"reconciling":[131],"records.":[134],"We":[135],"also":[136,159],"introduce":[137],"new":[138],"ideas":[139],"making":[141],"faster":[145],"more":[147],"efficient.":[148],"addition,":[150],"significant":[152],"modification":[153],"transitivity":[156],"rule":[157],"proposed.":[160],"Finally,":[161],"incorporates":[167],"all":[168],"these":[169],"techniques":[170],"into":[174],"environment.":[177],"performance":[179],"proposed":[182,195],"method":[183,196],"has":[184,197],"been":[185,198],"compared":[186],"to":[187],"other":[188],"methods":[189],"superiority":[192],"confirmed":[199],"by":[200],"experimental":[202],"results.":[203]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2018,"cited_by_count":2},{"year":2015,"cited_by_count":2},{"year":2013,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
