{"id":"https://openalex.org/W4411174340","doi":"https://doi.org/10.1186/s13321-025-01039-8","title":"UMAP-based clustering split for rigorous evaluation of AI models for virtual screening on cancer cell lines*","display_name":"UMAP-based clustering split for rigorous evaluation of AI models for virtual screening on cancer cell lines*","publication_year":2025,"publication_date":"2025-06-10","ids":{"openalex":"https://openalex.org/W4411174340","doi":"https://doi.org/10.1186/s13321-025-01039-8","pmid":"https://pubmed.ncbi.nlm.nih.gov/40495205"},"language":"en","primary_location":{"id":"doi:10.1186/s13321-025-01039-8","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-01039-8","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01039-8","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01039-8","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102683327","display_name":"Qianrong Guo","orcid":"https://orcid.org/0009-0008-8705-6133"},"institutions":[{"id":"https://openalex.org/I47508984","display_name":"Imperial College London","ror":"https://ror.org/041kmwe10","country_code":"GB","type":"education","lineage":["https://openalex.org/I47508984"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Qianrong Guo","raw_affiliation_strings":["Department of Bioengineering, Imperial College London, London, SW7 2AZ, UK"],"affiliations":[{"raw_affiliation_string":"Department of Bioengineering, Imperial College London, London, SW7 2AZ, UK","institution_ids":["https://openalex.org/I47508984"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004310687","display_name":"Saiveth Hern\u00e1ndez-Hern\u00e1ndez","orcid":null},"institutions":[{"id":"https://openalex.org/I47508984","display_name":"Imperial College London","ror":"https://ror.org/041kmwe10","country_code":"GB","type":"education","lineage":["https://openalex.org/I47508984"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Saiveth Hernandez-Hernandez","raw_affiliation_strings":["Department of Bioengineering, Imperial College London, London, SW7 2AZ, UK"],"affiliations":[{"raw_affiliation_string":"Department of Bioengineering, Imperial College London, London, SW7 2AZ, UK","institution_ids":["https://openalex.org/I47508984"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5005597587","display_name":"Pedro J. Ballester","orcid":"https://orcid.org/0000-0002-4078-743X"},"institutions":[{"id":"https://openalex.org/I47508984","display_name":"Imperial College London","ror":"https://ror.org/041kmwe10","country_code":"GB","type":"education","lineage":["https://openalex.org/I47508984"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Pedro J. Ballester","raw_affiliation_strings":["Department of Bioengineering, Imperial College London, London, SW7 2AZ, UK. p.ballester@imperial.ac.uk"],"affiliations":[{"raw_affiliation_string":"Department of Bioengineering, Imperial College London, London, SW7 2AZ, UK. p.ballester@imperial.ac.uk","institution_ids":["https://openalex.org/I47508984"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5102683327"],"corresponding_institution_ids":["https://openalex.org/I47508984"],"apc_list":{"value":1290,"currency":"GBP","value_usd":1582},"apc_paid":{"value":1290,"currency":"GBP","value_usd":1582},"fwci":28.3642,"has_fulltext":true,"cited_by_count":13,"citation_normalized_percentile":{"value":0.99520515,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"17","issue":"1","first_page":"94","last_page":"94"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10862","display_name":"AI in cancer detection","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10862","display_name":"AI in cancer detection","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10887","display_name":"Bioinformatics and Genomic Networks","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7136534452438354},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.661939263343811},{"id":"https://openalex.org/keywords/virtual-screening","display_name":"Virtual screening","score":0.5526161193847656},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4931192100048065},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.32796376943588257},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32532042264938354},{"id":"https://openalex.org/keywords/bioinformatics","display_name":"Bioinformatics","score":0.20187360048294067},{"id":"https://openalex.org/keywords/drug-discovery","display_name":"Drug discovery","score":0.12847694754600525}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7136534452438354},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.661939263343811},{"id":"https://openalex.org/C103697762","wikidata":"https://www.wikidata.org/wiki/Q4112105","display_name":"Virtual screening","level":3,"score":0.5526161193847656},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4931192100048065},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.32796376943588257},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32532042264938354},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.20187360048294067},{"id":"https://openalex.org/C74187038","wikidata":"https://www.wikidata.org/wiki/Q1418791","display_name":"Drug discovery","level":2,"score":0.12847694754600525},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1186/s13321-025-01039-8","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-01039-8","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01039-8","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},{"id":"pmid:40495205","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40495205","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of cheminformatics","raw_type":null},{"id":"pmh:oai:doaj.org/article:88dd833ed7d74fa7993bbc0371d8fe49","is_oa":true,"landing_page_url":"https://doaj.org/article/88dd833ed7d74fa7993bbc0371d8fe49","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Journal of Cheminformatics, Vol 17, Iss 1, Pp 1-18 (2025)","raw_type":"article"},{"id":"pmh:oai:pubmedcentral.nih.gov:12153141","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/12153141","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"J Cheminform","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1186/s13321-025-01039-8","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-01039-8","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01039-8","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320006","display_name":"Royal Society","ror":"https://ror.org/03wnrjx87"},{"id":"https://openalex.org/F4320320283","display_name":"Imperial College London","ror":"https://ror.org/041kmwe10"},{"id":"https://openalex.org/F4320320670","display_name":"Wolfson Foundation","ror":"https://ror.org/0333xzh65"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411174340.pdf","grobid_xml":"https://content.openalex.org/works/W4411174340.grobid-xml"},"referenced_works_count":56,"referenced_works":["https://openalex.org/W1757990252","https://openalex.org/W1984994707","https://openalex.org/W2004638738","https://openalex.org/W2042110087","https://openalex.org/W2060531713","https://openalex.org/W2087312216","https://openalex.org/W2104709519","https://openalex.org/W2113668783","https://openalex.org/W2129438079","https://openalex.org/W2129860849","https://openalex.org/W2265551258","https://openalex.org/W2594183968","https://openalex.org/W2754936311","https://openalex.org/W2806547269","https://openalex.org/W2920995682","https://openalex.org/W2922063386","https://openalex.org/W2937307539","https://openalex.org/W2946834067","https://openalex.org/W2982751856","https://openalex.org/W3035777330","https://openalex.org/W3037624696","https://openalex.org/W3043757685","https://openalex.org/W3088265803","https://openalex.org/W3094640617","https://openalex.org/W3094771832","https://openalex.org/W3099354564","https://openalex.org/W3137440015","https://openalex.org/W3200762293","https://openalex.org/W4210377205","https://openalex.org/W4210617107","https://openalex.org/W4213077304","https://openalex.org/W4214656930","https://openalex.org/W4214868967","https://openalex.org/W4286487916","https://openalex.org/W4291448287","https://openalex.org/W4309740782","https://openalex.org/W4320857994","https://openalex.org/W4323565291","https://openalex.org/W4367049415","https://openalex.org/W4378212018","https://openalex.org/W4385323450","https://openalex.org/W4386295701","https://openalex.org/W4387673203","https://openalex.org/W4388455891","https://openalex.org/W4389617734","https://openalex.org/W4389991855","https://openalex.org/W4391431291","https://openalex.org/W4391508920","https://openalex.org/W4396521393","https://openalex.org/W4402082179","https://openalex.org/W4402207565","https://openalex.org/W4402604262","https://openalex.org/W4402816264","https://openalex.org/W4405183894","https://openalex.org/W4409801681","https://openalex.org/W4410790937"],"related_works":["https://openalex.org/W3180887190","https://openalex.org/W2320211095","https://openalex.org/W2308164565","https://openalex.org/W2062768126","https://openalex.org/W2059230675","https://openalex.org/W3003855012","https://openalex.org/W2347569454","https://openalex.org/W4241018400","https://openalex.org/W4211208557","https://openalex.org/W1566373881"],"abstract_inverted_index":{"Virtual":[0],"Screening":[1],"(VS)":[2,317],"of":[3,29,54,164,208,311],"large":[4],"compound":[5],"libraries":[6,56],"using":[7,204],"Artificial":[8],"Intelligence":[9],"(AI)":[10],"models":[11,114,166,292],"is":[12,23,230],"a":[13,162,265,285],"highly":[14],"effective":[15],"approach":[16],"for":[17,25,186,217,242,269,314,327],"early":[18],"drug":[19],"discovery.":[20],"Data":[21],"splitting":[22,171,267],"crucial":[24],"benchmarking":[26],"the":[27,52,142,258,309,325],"performance":[28],"such":[30],"AI":[31,113,291],"models.":[32],"Traditional":[33],"random":[34,199],"data":[35],"splits":[36,195,206,212,216],"often":[37],"result":[38],"in":[39,43,134,250,299],"structurally":[40,60],"similar":[41],"molecules":[42,71,81,123],"both":[44],"training":[45,99],"and":[46,76,100,141,148,158,167,183,196,213,240,248,279],"test":[47,101],"sets,":[48,102],"which":[49,69,79],"conflict":[50],"with":[51,231],"reality":[53],"VS":[55,232],"that":[57,91,177],"typically":[58],"contain":[59],"diverse":[61],"compounds.":[62],"To":[63],"tackle":[64],"this":[65],"challenge,":[66],"scaffold":[67,194,215,281],"split,":[68],"groups":[70],"by":[72,82,190,260],"shared":[73],"core":[74],"structure,":[75],"Butina":[77,139,191,211,277],"clustering,":[78],"clusters":[80],"chemotypes,":[83],"have":[84],"long":[85],"been":[86],"used.":[87],"However,":[88],"we":[89,160,202,224],"show":[90,176,306],"these":[92],"methods":[93,275],"still":[94],"introduce":[95],"high":[96],"similarities":[97],"between":[98],"leading":[103],"to":[104,289,319],"overestimated":[105],"model":[106,187],"performance.":[107],"Our":[108],"study":[109],"examined":[110],"four":[111,135,170],"representative":[112],"across":[115],"60":[116],"NCI-60":[117],"datasets,":[118,271],"each":[119],"comprising":[120],"approximately":[121],"33,000-54,000":[122],"tested":[124],"on":[125],"different":[126],"cancer":[127],"cell":[128],"lines.":[129],"Each":[130],"dataset":[131],"was":[132],"split":[133,179],"ways:":[136],"random,":[137],"scaffold,":[138],"clustering":[140,263,278],"more":[143,181,294],"realistic":[144,184,295],"Uniform":[145],"Manifold":[146],"Approximation":[147],"Projection":[149],"(UMAP)":[150],"clustering.":[151],"Using":[152],"Linear":[153],"Regression,":[154],"Random":[155],"Forest,":[156],"Transformer-CNN,":[157],"GEM,":[159],"trained":[161],"total":[163],"8400":[165],"evaluated":[168],"under":[169,293],"methods.":[172],"These":[173],"comprehensive":[174],"results":[175],"UMAP":[178,205,262],"provides":[180],"challenging":[182],"benchmarks":[185],"evaluation,":[188],"followed":[189],"splits,":[192],"then":[193],"closely":[197],"after":[198],"splits.":[200,282],"Consequently,":[201],"recommend":[203],"instead":[207],"overly":[209],"optimistic":[210],"especially":[214,280],"molecular":[218,270,300],"property":[219,301],"prediction,":[220],"including":[221],"VS.":[222],"Lastly,":[223],"illustrate":[225],"how":[226,307],"misaligned":[227],"ROC":[228,312],"AUC":[229,313],"goals,":[233],"despite":[234,321],"its":[235,322],"common":[236],"use.":[237],"The":[238,303],"code":[239],"datasets":[241],"reproducibility":[243],"are":[244],"available":[245],"at":[246],"https://github.com/Rong830/UMAP_split_for_VS":[247],"archived":[249],"https://zenodo.org/records/14736486":[251],".":[252],"Scientific":[253],"contribution":[254],"This":[255],"work":[256],"advances":[257],"field":[259],"introducing":[261],"as":[264],"robust":[266],"method":[268],"improving":[272],"over":[273],"traditional":[274],"like":[276],"It":[283],"offers":[284],"new":[286],"evaluation":[287,329],"framework":[288],"benchmark":[290],"conditions,":[296],"fostering":[297],"progress":[298],"prediction.":[302],"findings":[304],"also":[305],"inappropriate":[308],"use":[310],"virtual":[315],"screening":[316],"continues":[318],"be,":[320],"popularity,":[323],"emphasizing":[324],"need":[326],"context-specific":[328],"metrics.":[330]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":6}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
