{"id":"https://openalex.org/W2785239010","doi":"https://doi.org/10.1145/3131611","title":"Comparative Analysis of Sequence Clustering Methods for Deduplication of Biological Databases","display_name":"Comparative Analysis of Sequence Clustering Methods for Deduplication of Biological Databases","publication_year":2017,"publication_date":"2017-09-30","ids":{"openalex":"https://openalex.org/W2785239010","doi":"https://doi.org/10.1145/3131611","mag":"2785239010"},"language":"en","primary_location":{"id":"doi:10.1145/3131611","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3131611","pdf_url":null,"source":{"id":"https://openalex.org/S110189822","display_name":"Journal of Data and Information Quality","issn_l":"1936-1955","issn":["1936-1955","1936-1963"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Data and Information Quality","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://figshare.com/articles/journal_contribution/Comparative_analysis_of_sequence_clustering_methods_for_deduplication_of_biological_databases/27510858","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042874172","display_name":"Qingyu Chen","orcid":"https://orcid.org/0000-0002-6036-1516"},"institutions":[{"id":"https://openalex.org/I165779595","display_name":"The University of Melbourne","ror":"https://ror.org/01ej9dk98","country_code":"AU","type":"education","lineage":["https://openalex.org/I165779595"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Qingyu Chen","raw_affiliation_strings":["University of Melbourne, Australia"],"affiliations":[{"raw_affiliation_string":"University of Melbourne, Australia","institution_ids":["https://openalex.org/I165779595"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377279","display_name":"Yu Wan","orcid":"https://orcid.org/0000-0002-8354-7647"},"institutions":[{"id":"https://openalex.org/I165779595","display_name":"The University of Melbourne","ror":"https://ror.org/01ej9dk98","country_code":"AU","type":"education","lineage":["https://openalex.org/I165779595"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yu Wan","raw_affiliation_strings":["University of Melbourne, Victoria, Australia"],"affiliations":[{"raw_affiliation_string":"University of Melbourne, Victoria, Australia","institution_ids":["https://openalex.org/I165779595"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021983750","display_name":"Xiuzhen Zhang","orcid":"https://orcid.org/0000-0001-5558-3790"},"institutions":[{"id":"https://openalex.org/I4210095297","display_name":"MIT University","ror":"https://ror.org/00v140q16","country_code":"MK","type":"education","lineage":["https://openalex.org/I4210095297"]},{"id":"https://openalex.org/I82951845","display_name":"RMIT University","ror":"https://ror.org/04ttjf776","country_code":"AU","type":"education","lineage":["https://openalex.org/I82951845"]}],"countries":["AU","MK"],"is_corresponding":false,"raw_author_name":"Xiuzhen Zhang","raw_affiliation_strings":["RMIT University, Melbourne VIC"],"affiliations":[{"raw_affiliation_string":"RMIT University, Melbourne VIC","institution_ids":["https://openalex.org/I4210095297","https://openalex.org/I82951845"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102772304","display_name":"Yang Lei","orcid":"https://orcid.org/0000-0003-3780-6510"},"institutions":[{"id":"https://openalex.org/I165779595","display_name":"The University of Melbourne","ror":"https://ror.org/01ej9dk98","country_code":"AU","type":"education","lineage":["https://openalex.org/I165779595"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yang Lei","raw_affiliation_strings":["University of Melbourne, Australia"],"affiliations":[{"raw_affiliation_string":"University of Melbourne, Australia","institution_ids":["https://openalex.org/I165779595"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041495909","display_name":"Justin Zobel","orcid":"https://orcid.org/0000-0001-6622-032X"},"institutions":[{"id":"https://openalex.org/I165779595","display_name":"The University of Melbourne","ror":"https://ror.org/01ej9dk98","country_code":"AU","type":"education","lineage":["https://openalex.org/I165779595"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Justin Zobel","raw_affiliation_strings":["University of Melbourne, Australia"],"affiliations":[{"raw_affiliation_string":"University of Melbourne, Australia","institution_ids":["https://openalex.org/I165779595"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067214173","display_name":"Karin Verspoor","orcid":"https://orcid.org/0000-0002-8661-1544"},"institutions":[{"id":"https://openalex.org/I165779595","display_name":"The University of Melbourne","ror":"https://ror.org/01ej9dk98","country_code":"AU","type":"education","lineage":["https://openalex.org/I165779595"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Karin Verspoor","raw_affiliation_strings":["University of Melbourne, Australia"],"affiliations":[{"raw_affiliation_string":"University of Melbourne, Australia","institution_ids":["https://openalex.org/I165779595"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5042874172"],"corresponding_institution_ids":["https://openalex.org/I165779595"],"apc_list":null,"apc_paid":null,"fwci":0.8083,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.78646545,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":"9","issue":"3","first_page":"1","last_page":"27"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12640","display_name":"Environmental DNA in Biodiversity Studies","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/2303","display_name":"Ecology"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9829999804496765,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.8366363048553467},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7543741464614868},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.7026335000991821},{"id":"https://openalex.org/keywords/biological-database","display_name":"Biological database","score":0.6933234930038452},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.6475688219070435},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.6212303638458252},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.49055761098861694},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.42834362387657166},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3767543435096741},{"id":"https://openalex.org/keywords/bioinformatics","display_name":"Bioinformatics","score":0.16527551412582397},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.10644558072090149},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.09591823816299438}],"concepts":[{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.8366363048553467},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7543741464614868},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.7026335000991821},{"id":"https://openalex.org/C20901353","wikidata":"https://www.wikidata.org/wiki/Q4117139","display_name":"Biological database","level":2,"score":0.6933234930038452},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.6475688219070435},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6212303638458252},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.49055761098861694},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.42834362387657166},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3767543435096741},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.16527551412582397},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.10644558072090149},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.09591823816299438},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3131611","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3131611","pdf_url":null,"source":{"id":"https://openalex.org/S110189822","display_name":"Journal of Data and Information Quality","issn_l":"1936-1955","issn":["1936-1955","1936-1963"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Data and Information Quality","raw_type":"journal-article"},{"id":"pmh:oai:figshare.com:article/27510858","is_oa":true,"landing_page_url":"https://figshare.com/articles/journal_contribution/Comparative_analysis_of_sequence_clustering_methods_for_deduplication_of_biological_databases/27510858","pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"}],"best_oa_location":{"id":"pmh:oai:figshare.com:article/27510858","is_oa":true,"landing_page_url":"https://figshare.com/articles/journal_contribution/Comparative_analysis_of_sequence_clustering_methods_for_deduplication_of_biological_databases/27510858","pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":100,"referenced_works":["https://openalex.org/W255556494","https://openalex.org/W1598214679","https://openalex.org/W1726853348","https://openalex.org/W1783597500","https://openalex.org/W1854015338","https://openalex.org/W1895017250","https://openalex.org/W1971647276","https://openalex.org/W1980948656","https://openalex.org/W1985059878","https://openalex.org/W1989064680","https://openalex.org/W1993320088","https://openalex.org/W1993519105","https://openalex.org/W2003204068","https://openalex.org/W2003374509","https://openalex.org/W2004291985","https://openalex.org/W2004345492","https://openalex.org/W2008116827","https://openalex.org/W2008830098","https://openalex.org/W2011250480","https://openalex.org/W2031250218","https://openalex.org/W2035407577","https://openalex.org/W2041369648","https://openalex.org/W2041439319","https://openalex.org/W2047242127","https://openalex.org/W2055043387","https://openalex.org/W2060178110","https://openalex.org/W2060797027","https://openalex.org/W2065259291","https://openalex.org/W2072424580","https://openalex.org/W2074231493","https://openalex.org/W2076048958","https://openalex.org/W2077842462","https://openalex.org/W2080200681","https://openalex.org/W2081746584","https://openalex.org/W2084168100","https://openalex.org/W2093794723","https://openalex.org/W2097193844","https://openalex.org/W2097606916","https://openalex.org/W2103017472","https://openalex.org/W2106386982","https://openalex.org/W2107903674","https://openalex.org/W2108166985","https://openalex.org/W2108211735","https://openalex.org/W2108991785","https://openalex.org/W2113231090","https://openalex.org/W2113586398","https://openalex.org/W2116699248","https://openalex.org/W2117709367","https://openalex.org/W2118101156","https://openalex.org/W2121947440","https://openalex.org/W2123023820","https://openalex.org/W2124351063","https://openalex.org/W2125587588","https://openalex.org/W2129800387","https://openalex.org/W2133098435","https://openalex.org/W2135083016","https://openalex.org/W2137479650","https://openalex.org/W2140190241","https://openalex.org/W2144252381","https://openalex.org/W2144420897","https://openalex.org/W2145036943","https://openalex.org/W2145084154","https://openalex.org/W2145349611","https://openalex.org/W2156125289","https://openalex.org/W2156357245","https://openalex.org/W2156619206","https://openalex.org/W2159348493","https://openalex.org/W2161550872","https://openalex.org/W2170747616","https://openalex.org/W2224056471","https://openalex.org/W2273522830","https://openalex.org/W2287926972","https://openalex.org/W2295127861","https://openalex.org/W2323375945","https://openalex.org/W2343964059","https://openalex.org/W2344427668","https://openalex.org/W2399636967","https://openalex.org/W2404565489","https://openalex.org/W2472351724","https://openalex.org/W2492295672","https://openalex.org/W2493860774","https://openalex.org/W2518938523","https://openalex.org/W2520610372","https://openalex.org/W2546848075","https://openalex.org/W2556455564","https://openalex.org/W2557595285","https://openalex.org/W2562773385","https://openalex.org/W2569138550","https://openalex.org/W2579441661","https://openalex.org/W2604808360","https://openalex.org/W2605068739","https://openalex.org/W2739999456","https://openalex.org/W2949155040","https://openalex.org/W2950894235","https://openalex.org/W2964648167","https://openalex.org/W4206671592","https://openalex.org/W4234714728","https://openalex.org/W4235121031","https://openalex.org/W4247554111","https://openalex.org/W4299497146"],"related_works":["https://openalex.org/W3144870715","https://openalex.org/W3142319788","https://openalex.org/W2587188779","https://openalex.org/W3132870970","https://openalex.org/W2943088381","https://openalex.org/W4296125805","https://openalex.org/W4385804830","https://openalex.org/W2144348063","https://openalex.org/W2074021203","https://openalex.org/W1982579475"],"abstract_inverted_index":{"The":[0],"massive":[1],"volumes":[2],"of":[3,22,45,108,116,127,157,164,175,197,204,245],"data":[4,20],"in":[5,34],"biological":[6,15,97,184],"sequence":[7,148,206,247],"databases":[8],"provide":[9],"a":[10,26,42,113,136,154,183,201],"remarkable":[11],"resource":[12],"for":[13,61,74,99,112,147,239,241,250],"large-scale":[14],"studies.":[16],"However,":[17],"the":[18,125,142,158,171,176,195,205,213,246],"underlying":[19],"quality":[21],"these":[23,198],"resources":[24],"is":[25,32],"critical":[27],"concern.":[28],"A":[29],"particular":[30],"challenge":[31],"duplication,":[33],"which":[35],"multiple":[36],"records":[37],"have":[38,92,120],"similar":[39],"sequences,":[40],"creating":[41],"high":[43,114],"level":[44],"redundancy":[46,159],"that":[47,187,212],"impacts":[48],"database":[49,55,62,75,100,149],"storage,":[50],"curation,":[51,63],"and":[52,73,140,173,182,217,227],"search.":[53],"Biological":[54],"deduplication":[56],"has":[57],"two":[58,143],"direct":[59],"applications:":[60],"where":[64,77],"detected":[65,78],"duplicates":[66],"are":[67,225,231],"removed":[68],"to":[69,87,96,169,193,236],"improve":[70],"curation":[71],"efficiency,":[72],"search,":[76],"duplicate":[79],"sequences":[80,98,109],"may":[81],"be":[82],"flagged":[83],"but":[84],"remain":[85],"available":[86],"support":[88],"analysis.":[89],"Clustering":[90],"methods":[91],"been":[93,121],"widely":[94],"applied":[95],"deduplication.":[101,150,251],"Since":[102],"an":[103],"exhaustive":[104],"all-by-all":[105],"pairwise":[106],"comparison":[107,137],"cannot":[110],"scale":[111],"volume":[115],"data,":[117],"heuristic":[118],"approaches":[119],"recruited,":[122],"such":[123],"as":[124],"use":[126],"simple":[128],"similarity":[129],"thresholds.":[130],"In":[131],"this":[132],"article,":[133],"we":[134],"present":[135],"between":[138,215],"CD-HIT":[139],"UCLUST,":[141],"best-known":[144],"clustering":[145,166,207,248],"tools":[146,249],"Our":[151,209],"contributions":[152],"include":[153],"detailed":[155],"assessment":[156],"remaining":[160],"after":[161],"deduplication,":[162],"application":[163,203],"standard":[165],"evaluation":[167,234],"metrics":[168],"quantify":[170],"cohesion":[172],"separation":[174],"clusters":[177],"generated":[178],"by":[179],"each":[180],"method,":[181],"case":[185],"study":[186],"assesses":[188],"intracluster":[189],"function":[190],"annotation":[191],"consistency":[192],"demonstrate":[194],"impact":[196],"factors":[199],"on":[200],"practical":[202,237],"methods.":[208],"results":[210],"show":[211],"trade-off":[214],"efficiency":[216],"accuracy":[218],"becomes":[219],"acute":[220],"when":[221,228],"low":[222],"threshold":[223],"values":[224],"used":[226],"cluster":[229],"sizes":[230],"large.":[232],"This":[233],"leads":[235],"recommendations":[238],"users":[240],"more":[242],"effective":[243],"uses":[244]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
