{"id":"https://openalex.org/W7162796272","doi":"https://doi.org/10.48550/arxiv.2605.28868","title":"TaxDistill: Improving Metagenomic Taxonomic Annotation via Distilled Genomic Foundation Models","display_name":"TaxDistill: Improving Metagenomic Taxonomic Annotation via Distilled Genomic Foundation Models","publication_year":2026,"publication_date":"2026-05-22","ids":{"openalex":"https://openalex.org/W7162796272","doi":"https://doi.org/10.48550/arxiv.2605.28868"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.28868","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28868","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.28868","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070256623","display_name":"Rongye Ye","orcid":"https://orcid.org/0009-0002-5528-965X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Rongye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137399241","display_name":"Lun Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137311410","display_name":"Zheng Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Zheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137375798","display_name":"Yiran Zhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhan, Yiran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137357428","display_name":"Shuhui Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Shuhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.5863000154495239,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.5863000154495239,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.21160000562667847,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10066","display_name":"Gut microbiota and health","score":0.05860000103712082,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/metagenomics","display_name":"Metagenomics","score":0.9124000072479248},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.7272999882698059},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5253999829292297},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.43459999561309814},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.38839998841285706},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.38269999623298645},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.36719998717308044}],"concepts":[{"id":"https://openalex.org/C15151743","wikidata":"https://www.wikidata.org/wiki/Q903778","display_name":"Metagenomics","level":3,"score":0.9124000072479248},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.7272999882698059},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6444000005722046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5575000047683716},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5253999829292297},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5102999806404114},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.43459999561309814},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4296000003814697},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.38839998841285706},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.38269999623298645},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.36719998717308044},{"id":"https://openalex.org/C189592816","wikidata":"https://www.wikidata.org/wiki/Q427626","display_name":"Taxonomic rank","level":3,"score":0.35740000009536743},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3368000090122223},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C48702757","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Biological classification","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C70721500","wikidata":"https://www.wikidata.org/wiki/Q177005","display_name":"Computational biology","level":1,"score":0.2777999937534332},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C2779969263","wikidata":"https://www.wikidata.org/wiki/Q25098939","display_name":"Environmental DNA","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C189206191","wikidata":"https://www.wikidata.org/wiki/Q222046","display_name":"Genomics","level":4,"score":0.26190000772476196}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.28868","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28868","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.28868","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28868","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7005807757377625,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Metagenomic":[0],"taxonomic":[1],"annotation":[2],"aims":[3],"to":[4,52,113,178],"identify":[5],"the":[6,26,31,39,110,139,166,171,181],"microbial":[7,28],"origins":[8],"of":[9,33,41,174],"DNA":[10],"fragments":[11],"in":[12,160,193],"environmental":[13],"samples.":[14],"Traditional":[15],"methods":[16,61],"that":[17,47,77,155],"rely":[18,63],"on":[19,64,123,149,165],"sequence":[20,57],"similarity":[21,68],"are":[22],"often":[23],"constrained":[24],"by":[25,143],"high":[27],"diversity":[29],"and":[30,82,118],"incompleteness":[32],"reference":[34],"databases,":[35],"which":[36,73],"has":[37],"motivated":[38],"development":[40],"learning":[42,81],"approaches":[43],"such":[44],"as":[45,109],"Taxometer":[46,182],"perform":[48],"post":[49],"hoc":[50],"correction":[51,192],"learn":[53],"more":[54],"informative":[55],"metagenomic":[56,98,195],"representations.":[58],"However,":[59],"these":[60],"typically":[62],"labels":[65,121],"derived":[66],"from":[67,176],"search":[69],"tools":[70],"during":[71],"training,":[72],"inevitably":[74],"introduces":[75],"noise":[76,141],"can":[78],"impair":[79],"representation":[80],"degrade":[83],"classification":[84],"performance.":[85],"To":[86],"address":[87],"this":[88,127],"issue,":[89],"we":[90],"propose":[91],"TaxDistill,":[92],"a":[93,103,132,187],"knowledge":[94],"distillation":[95],"framework":[96],"for":[97,190],"classification.":[99],"We":[100],"introduce":[101],"GenomeOcean,":[102],"500M":[104],"parameter":[105],"genomic":[106],"foundation":[107],"model,":[108],"teacher":[111],"network":[112],"extract":[114],"deep":[115],"semantic":[116],"features":[117],"generate":[119],"soft":[120,128],"based":[122],"confidence.":[124],"By":[125],"distilling":[126],"label":[129,140,191],"information":[130],"into":[131],"lightweight":[133],"student":[134],"network,":[135],"TaxDistill":[136,156,185],"effectively":[137],"reduces":[138],"introduced":[142],"initial":[144],"retrieval":[145],"tools.":[146],"Comprehensive":[147],"experiments":[148],"seven":[150],"diverse":[151],"CAMI2":[152],"datasets":[153],"demonstrate":[154],"outperforms":[157],"existing":[158],"baselines":[159],"most":[161],"scenarios.":[162],"For":[163],"instance,":[164],"Gastrointestinal":[167],"dataset,":[168],"it":[169],"improves":[170],"F1":[172],"score":[173],"MMseqs2":[175],"0.763":[177],"0.941,":[179],"outperforming":[180],"baseline.":[183],"Overall,":[184],"provides":[186],"reliable":[188],"method":[189],"complex":[194],"analysis.":[196]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-30T00:00:00"}
