{"id":"https://openalex.org/W7126071851","doi":"https://doi.org/10.1109/bibm66473.2025.11356063","title":"RabbitTClust2: Fast, Scalable, and Versatile Clustering for Massive Genomic Datasets","display_name":"RabbitTClust2: Fast, Scalable, and Versatile Clustering for Massive Genomic Datasets","publication_year":2025,"publication_date":"2025-12-15","ids":{"openalex":"https://openalex.org/W7126071851","doi":"https://doi.org/10.1109/bibm66473.2025.11356063"},"language":null,"primary_location":{"id":"doi:10.1109/bibm66473.2025.11356063","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bibm66473.2025.11356063","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124293905","display_name":"Tong Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tong Zhang","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124207793","display_name":"Xiaoming Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoming Xu","raw_affiliation_strings":["Tsinghua University,Department of Computer Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Computer Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091033464","display_name":"Zekun Yin","orcid":"https://orcid.org/0000-0001-6002-0028"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zekun Yin","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072639624","display_name":"Lifeng Yan","orcid":"https://orcid.org/0000-0001-9610-1268"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lifeng Yan","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124166537","display_name":"Yang Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Yang","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124271169","display_name":"Yijie Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yijie Gao","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124232249","display_name":"Xiaohui Duan","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaohui Duan","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112780307","display_name":"Bertil Schmidt","orcid":null},"institutions":[{"id":"https://openalex.org/I197323543","display_name":"Johannes Gutenberg University Mainz","ror":"https://ror.org/023b0x485","country_code":"DE","type":"education","lineage":["https://openalex.org/I197323543"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bertil Schmidt","raw_affiliation_strings":["Institute for Computer Science, Johannes Gutenberg University,Mainz,Germany"],"affiliations":[{"raw_affiliation_string":"Institute for Computer Science, Johannes Gutenberg University,Mainz,Germany","institution_ids":["https://openalex.org/I197323543"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045962522","display_name":"Weiguo Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiguo Liu","raw_affiliation_strings":["School of Software, Shandong University,Jinan,China"],"affiliations":[{"raw_affiliation_string":"School of Software, Shandong University,Jinan,China","institution_ids":["https://openalex.org/I154099455"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5124293905"],"corresponding_institution_ids":["https://openalex.org/I154099455"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.67226783,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1495","last_page":"1502"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.4034000039100647,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.4034000039100647,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10066","display_name":"Gut microbiota and health","score":0.1274999976158142,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10885","display_name":"Gene expression and cancer classification","score":0.06930000334978104,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.8442000150680542},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6407999992370605},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5981000065803528},{"id":"https://openalex.org/keywords/correlation-clustering","display_name":"Correlation clustering","score":0.5626999735832214},{"id":"https://openalex.org/keywords/cure-data-clustering-algorithm","display_name":"CURE data clustering algorithm","score":0.5389000177383423},{"id":"https://openalex.org/keywords/brown-clustering","display_name":"Brown clustering","score":0.5006999969482422},{"id":"https://openalex.org/keywords/canopy-clustering-algorithm","display_name":"Canopy clustering algorithm","score":0.46860000491142273},{"id":"https://openalex.org/keywords/refseq","display_name":"RefSeq","score":0.46050000190734863},{"id":"https://openalex.org/keywords/clustering-high-dimensional-data","display_name":"Clustering high-dimensional data","score":0.45840001106262207}],"concepts":[{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.8442000150680542},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7584999799728394},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6546000242233276},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6407999992370605},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5981000065803528},{"id":"https://openalex.org/C94641424","wikidata":"https://www.wikidata.org/wiki/Q5172845","display_name":"Correlation clustering","level":3,"score":0.5626999735832214},{"id":"https://openalex.org/C33704608","wikidata":"https://www.wikidata.org/wiki/Q5014717","display_name":"CURE data clustering algorithm","level":4,"score":0.5389000177383423},{"id":"https://openalex.org/C167984511","wikidata":"https://www.wikidata.org/wiki/Q17003931","display_name":"Brown clustering","level":5,"score":0.5006999969482422},{"id":"https://openalex.org/C104047586","wikidata":"https://www.wikidata.org/wiki/Q5033439","display_name":"Canopy clustering algorithm","level":4,"score":0.46860000491142273},{"id":"https://openalex.org/C151810110","wikidata":"https://www.wikidata.org/wiki/Q7307074","display_name":"RefSeq","level":4,"score":0.46050000190734863},{"id":"https://openalex.org/C184509293","wikidata":"https://www.wikidata.org/wiki/Q5136711","display_name":"Clustering high-dimensional data","level":3,"score":0.45840001106262207},{"id":"https://openalex.org/C17212007","wikidata":"https://www.wikidata.org/wiki/Q5511111","display_name":"Fuzzy clustering","level":3,"score":0.4528999924659729},{"id":"https://openalex.org/C22648726","wikidata":"https://www.wikidata.org/wiki/Q7523744","display_name":"Single-linkage clustering","level":5,"score":0.43220001459121704},{"id":"https://openalex.org/C186767784","wikidata":"https://www.wikidata.org/wiki/Q5162841","display_name":"Consensus clustering","level":5,"score":0.4020000100135803},{"id":"https://openalex.org/C193143536","wikidata":"https://www.wikidata.org/wiki/Q5227360","display_name":"Data stream clustering","level":5,"score":0.38089999556541443},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C3020077223","wikidata":"https://www.wikidata.org/wiki/Q222046","display_name":"Genomic information","level":4,"score":0.32679998874664307},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.31700000166893005},{"id":"https://openalex.org/C141231307","wikidata":"https://www.wikidata.org/wiki/Q7020","display_name":"Genome","level":3,"score":0.3140999972820282},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3059000074863434},{"id":"https://openalex.org/C144817290","wikidata":"https://www.wikidata.org/wiki/Q2976575","display_name":"Biclustering","level":5,"score":0.3046000003814697},{"id":"https://openalex.org/C27964816","wikidata":"https://www.wikidata.org/wiki/Q5164359","display_name":"Constrained clustering","level":5,"score":0.29670000076293945},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.26409998536109924},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.26080000400543213},{"id":"https://openalex.org/C39235581","wikidata":"https://www.wikidata.org/wiki/Q5158434","display_name":"Conceptual clustering","level":5,"score":0.26030001044273376},{"id":"https://openalex.org/C82261393","wikidata":"https://www.wikidata.org/wiki/Q17038699","display_name":"Hierarchical clustering of networks","level":5,"score":0.25999999046325684},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25780001282691956}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bibm66473.2025.11356063","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bibm66473.2025.11356063","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W2087064593","https://openalex.org/W2150926065","https://openalex.org/W2160681728","https://openalex.org/W2170747616","https://openalex.org/W2585711475","https://openalex.org/W2761430568","https://openalex.org/W2950150251","https://openalex.org/W2951254987","https://openalex.org/W2953008890","https://openalex.org/W2972805712","https://openalex.org/W2999958350","https://openalex.org/W3137871957","https://openalex.org/W4281753295","https://openalex.org/W4377011795","https://openalex.org/W4385897776","https://openalex.org/W4388721877","https://openalex.org/W4404225843","https://openalex.org/W4409867982","https://openalex.org/W4412404067"],"related_works":[],"abstract_inverted_index":{"Clustering":[0],"is":[1,65,123,178,298],"a":[2,87,105,148,165,173,213,222,227,242,289],"fundamental":[3],"method":[4],"for":[5,53,69,94,115,292],"extracting":[6],"meaningful":[7],"information":[8],"from":[9,252],"large-scale":[10,31,96],"genomic":[11,97],"datasets.":[12],"As":[13],"sequencing":[14],"technologies":[15],"advance,":[16],"efficient":[17,89,102,196],"and":[18,90,113,199],"scalable":[19],"clustering":[20,33,76,95,162,181,198,206,245,275],"tools":[21],"have":[22],"become":[23],"increasingly":[24],"important.":[25],"Despite":[26],"its":[27,57],"outstanding":[28],"efficiency":[29],"in":[30,137,169,185,226,295],"genome":[32],"tasks,":[34],"RabbitTClust":[35,64],"still":[36],"faces":[37],"certain":[38],"limitations.":[39],"On":[40,60,172],"the":[41,61,127,265,274,279],"one":[42,187],"hand,":[43,63],"as":[44],"data":[45,72],"volumes":[46],"continue":[47],"to":[48,125,143,153,207,240,254],"grow,":[49],"there":[50],"remains":[51],"room":[52],"further":[54],"optimization":[55],"of":[56,180,216,244,258],"computational":[58],"performance.":[59],"other":[62],"not":[66],"well":[67],"suited":[68],"frequent":[70],"incremental":[71,197,205],"updates":[73],"or":[74],"fast":[75],"across":[77],"multiple":[78],"thresholds.":[79],"To":[80],"address":[81],"these":[82,120],"limitations,":[83],"we":[84,237],"introduce":[85],"RabbitTClust2,":[86],"highly":[88],"versatile":[91],"tool":[92],"designed":[93],"sequences.":[98],"RabbitTClust2":[99,122,146,177,190,203,239,297],"integrates":[100],"an":[101],"sketching":[103],"algorithm,":[104],"pruningand":[106],"inverted-index-based":[107],"minimum":[108],"spanning":[109],"tree":[110],"construction":[111],"method,":[112],"strategies":[114],"reusing":[116],"intermediate":[117],"results.":[118],"With":[119],"advancements,":[121],"able":[124],"cluster":[126,280],"latest":[128],"RefSeq":[129,266],"bacterial":[130,267],"dataset":[131,215],"(195":[132],"k":[133],"genomes,":[134],"820":[135],"GB":[136],"FASTA)":[138],"within":[139,219,261],"5":[140],"minutes.":[141],"Compared":[142],"previous":[144],"versions,":[145],"achieves":[147],"<tex":[149,154,228],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[150,155,229],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$2.4":[151],"\\times$</tex>":[152,157,231],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$4.5":[156],"speedup":[158,232],"while":[159],"maintaining":[160],"comparable":[161],"accuracy,":[163],"with":[164,247],"21":[166],"%":[167],"reduction":[168],"memory":[170],"consumption.":[171],"distributed":[174],"multi-node":[175],"platform,":[176],"capable":[179],"2.6":[182],"million":[183],"genomes":[184,218],"approximately":[186],"hour.":[188],"Furthermore,":[189],"offers":[191],"significant":[192],"versatility":[193],"by":[194],"supporting":[195],"rapid":[200],"multithreshold":[201],"analysis.":[202],"utilizes":[204],"integrate":[208],"1,000":[209],"new":[210],"sequences":[211],"into":[212],"pre-clustered":[214],"194,000":[217],"1":[220],"minute,":[221],"process":[223],"that":[224,272,285],"results":[225,246,270],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$22.7":[230],"over":[233],"RabbitTClust.":[234],"In":[235],"addition,":[236],"used":[238],"generate":[241],"series":[243],"Mash":[248],"distance":[249],"thresholds":[250],"ranging":[251],"0.01":[253],"0.2":[255],"(a":[256],"total":[257],"20":[259],"values)":[260],"7":[262],"minutes":[263],"on":[264],"dataset.":[268],"The":[269],"showed":[271],"when":[273],"threshold":[276,291],"approached":[277],"0.1,":[278],"compositions":[281],"changed":[282],"significantly,":[283],"suggesting":[284],"0.1":[286],"may":[287],"represent":[288],"critical":[290],"genuslevel":[293],"classification":[294],"bacteria.":[296],"available":[299],"at":[300],"https://github.com/RabbitBio/RabbitTClust.":[301]},"counts_by_year":[],"updated_date":"2026-02-01T03:34:12.195049","created_date":"2026-01-30T00:00:00"}
