{"id":"https://openalex.org/W7127177925","doi":"https://doi.org/10.1021/acs.jcim.5c02465","title":"Toward More Trustworthy QSAR: A Systematic Discussion on Data Set Partitioning","display_name":"Toward More Trustworthy QSAR: A Systematic Discussion on Data Set Partitioning","publication_year":2026,"publication_date":"2026-02-02","ids":{"openalex":"https://openalex.org/W7127177925","doi":"https://doi.org/10.1021/acs.jcim.5c02465","pmid":"https://pubmed.ncbi.nlm.nih.gov/41628301"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.5c02465","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c02465","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048773570","display_name":"Shangyu Li","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shangyu Li","raw_affiliation_strings":["School of Environmental Science and Engineering","Tianjin University"],"raw_orcid":"https://orcid.org/0009-0006-8546-485X","affiliations":[{"raw_affiliation_string":"School of Environmental Science and Engineering","institution_ids":[]},{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124777532","display_name":"Peizhe Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Peizhe Sun","raw_affiliation_strings":["School of Environmental Science and Engineering","Tianjin University"],"raw_orcid":"https://orcid.org/0000-0001-7538-5345","affiliations":[{"raw_affiliation_string":"School of Environmental Science and Engineering","institution_ids":[]},{"raw_affiliation_string":"Tianjin University","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5124777532"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16987389,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"66","issue":"4","first_page":"2199","last_page":"2210"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.3483000099658966,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.3483000099658966,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.1234000027179718,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.11630000174045563,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.9075000286102295},{"id":"https://openalex.org/keywords/comparability","display_name":"Comparability","score":0.8198999762535095},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6388999819755554},{"id":"https://openalex.org/keywords/trustworthiness","display_name":"Trustworthiness","score":0.5002999901771545},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4767000079154968},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.4722000062465668},{"id":"https://openalex.org/keywords/internal-validity","display_name":"Internal validity","score":0.4708999991416931},{"id":"https://openalex.org/keywords/external-validity","display_name":"External validity","score":0.3614000082015991}],"concepts":[{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.9075000286102295},{"id":"https://openalex.org/C197947376","wikidata":"https://www.wikidata.org/wiki/Q5155608","display_name":"Comparability","level":2,"score":0.8198999762535095},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6888999938964844},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6388999819755554},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5306000113487244},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.5002999901771545},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4767000079154968},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.4722000062465668},{"id":"https://openalex.org/C51082289","wikidata":"https://www.wikidata.org/wiki/Q4113800","display_name":"Internal validity","level":2,"score":0.4708999991416931},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38199999928474426},{"id":"https://openalex.org/C174106493","wikidata":"https://www.wikidata.org/wiki/Q1057880","display_name":"External validity","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.3522000014781952},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C87007009","wikidata":"https://www.wikidata.org/wiki/Q210832","display_name":"Statistical hypothesis testing","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.29820001125335693},{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.28369998931884766},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27480000257492065},{"id":"https://openalex.org/C55037315","wikidata":"https://www.wikidata.org/wiki/Q5421151","display_name":"Experimental data","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C40423286","wikidata":"https://www.wikidata.org/wiki/Q284172","display_name":"Selection bias","level":2,"score":0.25279998779296875}],"mesh":[{"descriptor_ui":"D021281","descriptor_name":"Quantitative Structure-Activity Relationship","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D021281","descriptor_name":"Quantitative Structure-Activity Relationship","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":2,"locations":[{"id":"doi:10.1021/acs.jcim.5c02465","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c02465","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:41628301","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41628301","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G639706917","display_name":null,"funder_award_id":"22176141","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G980703599","display_name":null,"funder_award_id":"22322608","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":46,"referenced_works":["https://openalex.org/W1934237883","https://openalex.org/W1980628324","https://openalex.org/W1982177423","https://openalex.org/W1995495408","https://openalex.org/W1996656176","https://openalex.org/W1996851544","https://openalex.org/W2001811464","https://openalex.org/W2014059083","https://openalex.org/W2019657322","https://openalex.org/W2019678805","https://openalex.org/W2054716083","https://openalex.org/W2128245586","https://openalex.org/W2139731838","https://openalex.org/W2147084679","https://openalex.org/W2295598076","https://openalex.org/W2314437871","https://openalex.org/W2804833668","https://openalex.org/W2895420596","https://openalex.org/W2973634761","https://openalex.org/W3048787834","https://openalex.org/W3049473680","https://openalex.org/W3093997868","https://openalex.org/W3177371468","https://openalex.org/W3216511624","https://openalex.org/W4210360375","https://openalex.org/W4210373151","https://openalex.org/W4224982784","https://openalex.org/W4288586849","https://openalex.org/W4309435358","https://openalex.org/W4312066574","https://openalex.org/W4315588845","https://openalex.org/W4320497376","https://openalex.org/W4353049675","https://openalex.org/W4365457471","https://openalex.org/W4365512526","https://openalex.org/W4376632252","https://openalex.org/W4378173547","https://openalex.org/W4378190947","https://openalex.org/W4382536268","https://openalex.org/W4387564099","https://openalex.org/W4388003021","https://openalex.org/W4405644078","https://openalex.org/W4406233501","https://openalex.org/W4408350489","https://openalex.org/W4415015775","https://openalex.org/W4417269059"],"related_works":[],"abstract_inverted_index":{"With":[0],"the":[1,13,30,60,68,126,151,161,183],"surge":[2],"in":[3,94,150],"QSAR":[4],"model":[5,43,138],"development,":[6],"concerns":[7],"about":[8],"evaluation":[9],"rigor,":[10],"particularly":[11],"regarding":[12],"influence":[14],"of":[15,24,32,62,70,172,185],"data":[16,22,49,63,153,164],"splitting,":[17],"have":[18],"grown.":[19],"Using":[20],"five":[21],"sets":[23],"various":[25],"sizes,":[26],"we":[27],"systematically":[28],"assessed":[29],"effects":[31],"random":[33,71,143,210],"splits":[34,37,131],"(RS),":[35],"similarity-based":[36],"(SS),":[38],"and":[39,53,67,121,181,220],"random-seed":[40],"variability":[41,141],"on":[42,118,147,160,206],"generalizability":[44,230],"under":[45,108],"two":[46],"scenarios:":[47],"limited":[48],"for":[50,133],"chemical":[51],"screening":[52],"standard":[54],"modeling":[55],"with":[56,198],"ample":[57],"data.":[58],"Both":[59],"choice":[61],"set":[64,154,165],"partitioning":[65,218],"method":[66],"selection":[69],"seeds":[72,144,211],"can":[73,89],"substantially":[74],"affect":[75],"internal":[76,91,120,134,148,234],"test":[77,92],"performance,":[78],"which":[79],"may":[80,113],"not":[81,100],"reliably":[82],"reflect":[83],"true":[84,228],"predictive":[85],"capability.":[86],"Although":[87],"SS":[88,112],"improve":[90,137],"performance":[93,135],"many":[95],"settings,":[96],"these":[97],"gains":[98],"do":[99],"necessarily":[101],"translate":[102],"into":[103],"stronger":[104],"external":[105,122,163,229],"generalizability.":[106],"Moreover,":[107],"low":[109],"sampling":[110],"ratios,":[111],"perform":[114],"worse":[115],"than":[116],"RS":[117],"both":[119],"tests.":[123],"This":[124,177],"challenges":[125],"implicit":[127],"assumption":[128],"that":[129,192],"rational":[130,214],"optimized":[132],"universally":[136],"performance.":[139],"Notably,":[140],"across":[142],"was":[145],"high":[146],"tests":[149],"smallest":[152],"(":[155],"R":[156,166],"2:":[157],"0.453\u20130.783),":[158],"whereas":[159],"fixed":[162],"2":[167],"varied":[168],"less":[169],"(0.633\u20130.672),":[170],"regardless":[171],"applicability":[173],"domain":[174],"(AD)":[175],"filtering.":[176],"undermined":[178],"cross-study":[179],"comparability":[180],"underscored":[182],"risk":[184],"overly":[186],"optimistic":[187],"conclusions.":[188],"Our":[189],"findings":[190],"highlighted":[191],"test-set":[193],"construction":[194],"must":[195],"be":[196,224],"aligned":[197],"real-world":[199],"application":[200],"scenarios.":[201],"Researchers":[202],"should":[203,223],"avoid":[204],"relying":[205],"single":[207],"or":[208,212],"cherry-picked":[209],"unsuitable":[213],"partitioning.":[215],"Transparent,":[216],"application-aligned":[217],"protocols":[219],"AD":[221],"methods":[222],"employed":[225],"to":[226],"emphasize":[227],"over":[231],"potentially":[232],"inflated":[233],"metrics.":[235]},"counts_by_year":[],"updated_date":"2026-06-15T08:34:33.830935","created_date":"2026-02-03T00:00:00"}
