{"id":"https://openalex.org/W7126431102","doi":"https://doi.org/10.1186/s12859-026-06372-9","title":"Sample size requirements for machine learning classification of binary outcomes in bulk RNA-Seq data","display_name":"Sample size requirements for machine learning classification of binary outcomes in bulk RNA-Seq data","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7126431102","doi":"https://doi.org/10.1186/s12859-026-06372-9","pmid":"https://pubmed.ncbi.nlm.nih.gov/41620619"},"language":"en","primary_location":{"id":"doi:10.1186/s12859-026-06372-9","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s12859-026-06372-9","pdf_url":null,"source":{"id":"https://openalex.org/S19032547","display_name":"BMC Bioinformatics","issn_l":"1471-2105","issn":["1471-2105"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC Bioinformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1186/s12859-026-06372-9","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031290867","display_name":"Scott Silvey","orcid":null},"institutions":[{"id":"https://openalex.org/I184840846","display_name":"Virginia Commonwealth University","ror":"https://ror.org/02nkdxk79","country_code":"US","type":"education","lineage":["https://openalex.org/I184840846"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Scott Silvey","raw_affiliation_strings":["School of Public Health, Department of Biostatistics, Virginia Commonwealth University, 830 East Main Street, Richmond, VA, 23219, USA. silveys@vcu.edu"],"affiliations":[{"raw_affiliation_string":"School of Public Health, Department of Biostatistics, Virginia Commonwealth University, 830 East Main Street, Richmond, VA, 23219, USA. silveys@vcu.edu","institution_ids":["https://openalex.org/I184840846"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000376105","display_name":"Amy L. Olex","orcid":null},"institutions":[{"id":"https://openalex.org/I184840846","display_name":"Virginia Commonwealth University","ror":"https://ror.org/02nkdxk79","country_code":"US","type":"education","lineage":["https://openalex.org/I184840846"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Amy Olex","raw_affiliation_strings":["C. Kenneth and Dianne Wright Center for Clinical and Translational Research, Virginia Commonwealth University, Richmond, VA, 23298, USA"],"affiliations":[{"raw_affiliation_string":"C. Kenneth and Dianne Wright Center for Clinical and Translational Research, Virginia Commonwealth University, Richmond, VA, 23298, USA","institution_ids":["https://openalex.org/I184840846"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010291947","display_name":"Shaojun Tang","orcid":"https://orcid.org/0000-0002-5141-0515"},"institutions":[{"id":"https://openalex.org/I184840846","display_name":"Virginia Commonwealth University","ror":"https://ror.org/02nkdxk79","country_code":"US","type":"education","lineage":["https://openalex.org/I184840846"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shaojun Tang","raw_affiliation_strings":["School of Public Health, Department of Biostatistics, Virginia Commonwealth University, 830 East Main Street, Richmond, VA, 23219, USA"],"affiliations":[{"raw_affiliation_string":"School of Public Health, Department of Biostatistics, Virginia Commonwealth University, 830 East Main Street, Richmond, VA, 23219, USA","institution_ids":["https://openalex.org/I184840846"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124658415","display_name":"Jinze Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I184840846","display_name":"Virginia Commonwealth University","ror":"https://ror.org/02nkdxk79","country_code":"US","type":"education","lineage":["https://openalex.org/I184840846"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinze Liu","raw_affiliation_strings":["C. Kenneth and Dianne Wright Center for Clinical and Translational Research, Virginia Commonwealth University, Richmond, VA, 23298, USA","School of Public Health, Department of Biostatistics, Virginia Commonwealth University, 830 East Main Street, Richmond, VA, 23219, USA"],"affiliations":[{"raw_affiliation_string":"C. Kenneth and Dianne Wright Center for Clinical and Translational Research, Virginia Commonwealth University, Richmond, VA, 23298, USA","institution_ids":["https://openalex.org/I184840846"]},{"raw_affiliation_string":"School of Public Health, Department of Biostatistics, Virginia Commonwealth University, 830 East Main Street, Richmond, VA, 23219, USA","institution_ids":["https://openalex.org/I184840846"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5031290867"],"corresponding_institution_ids":["https://openalex.org/I184840846"],"apc_list":{"value":1690,"currency":"GBP","value_usd":2072},"apc_paid":{"value":1690,"currency":"GBP","value_usd":2072},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.21312878,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"27","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10885","display_name":"Gene expression and cancer classification","score":0.5740000009536743,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10885","display_name":"Gene expression and cancer classification","score":0.5740000009536743,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10261","display_name":"Genetic Associations and Epidemiology","score":0.07599999755620956,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11297","display_name":"Ferroptosis and cancer prognosis","score":0.03880000114440918,"subfield":{"id":"https://openalex.org/subfields/2740","display_name":"Pulmonary and Respiratory Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sample-size-determination","display_name":"Sample size determination","score":0.7512999773025513},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.5674999952316284},{"id":"https://openalex.org/keywords/random-forest","display_name":"Random forest","score":0.5440000295639038},{"id":"https://openalex.org/keywords/bayesian-probability","display_name":"Bayesian probability","score":0.4871000051498413},{"id":"https://openalex.org/keywords/support-vector-machine","display_name":"Support vector machine","score":0.4729999899864197},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4724000096321106},{"id":"https://openalex.org/keywords/regression","display_name":"Regression","score":0.4456000030040741},{"id":"https://openalex.org/keywords/binary-classification","display_name":"Binary classification","score":0.4302999973297119},{"id":"https://openalex.org/keywords/regression-analysis","display_name":"Regression analysis","score":0.36340001225471497}],"concepts":[{"id":"https://openalex.org/C129848803","wikidata":"https://www.wikidata.org/wiki/Q2564360","display_name":"Sample size determination","level":2,"score":0.7512999773025513},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6075999736785889},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5776000022888184},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5705000162124634},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.5674999952316284},{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.5440000295639038},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.4871000051498413},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.4729999899864197},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4724000096321106},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4514000117778778},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.4456000030040741},{"id":"https://openalex.org/C66905080","wikidata":"https://www.wikidata.org/wiki/Q17005494","display_name":"Binary classification","level":3,"score":0.4302999973297119},{"id":"https://openalex.org/C152877465","wikidata":"https://www.wikidata.org/wiki/Q208042","display_name":"Regression analysis","level":2,"score":0.36340001225471497},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.34880000352859497},{"id":"https://openalex.org/C69738355","wikidata":"https://www.wikidata.org/wiki/Q1228929","display_name":"Linear discriminant analysis","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34549999237060547},{"id":"https://openalex.org/C52001869","wikidata":"https://www.wikidata.org/wiki/Q812530","display_name":"Naive Bayes classifier","level":3,"score":0.34060001373291016},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C33724603","wikidata":"https://www.wikidata.org/wiki/Q812540","display_name":"Bayesian network","level":2,"score":0.33399999141693115},{"id":"https://openalex.org/C207201462","wikidata":"https://www.wikidata.org/wiki/Q182505","display_name":"Bayes' theorem","level":3,"score":0.3197999894618988},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.298799991607666},{"id":"https://openalex.org/C2779190172","wikidata":"https://www.wikidata.org/wiki/Q4913888","display_name":"Binary data","level":3,"score":0.28769999742507935},{"id":"https://openalex.org/C58471807","wikidata":"https://www.wikidata.org/wiki/Q327120","display_name":"Receiver operating characteristic","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C71743495","wikidata":"https://www.wikidata.org/wiki/Q2845210","display_name":"Power analysis","level":3,"score":0.27619999647140503},{"id":"https://openalex.org/C117312493","wikidata":"https://www.wikidata.org/wiki/Q2035437","display_name":"Multivariable calculus","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C48921125","wikidata":"https://www.wikidata.org/wiki/Q10861030","display_name":"Linear regression","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C96608239","wikidata":"https://www.wikidata.org/wiki/Q1199823","display_name":"Statistical power","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.260699987411499},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.2547000050544739},{"id":"https://openalex.org/C199335787","wikidata":"https://www.wikidata.org/wiki/Q743364","display_name":"Negative binomial distribution","level":3,"score":0.2524999976158142},{"id":"https://openalex.org/C151956035","wikidata":"https://www.wikidata.org/wiki/Q1132755","display_name":"Logistic regression","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1186/s12859-026-06372-9","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s12859-026-06372-9","pdf_url":null,"source":{"id":"https://openalex.org/S19032547","display_name":"BMC Bioinformatics","issn_l":"1471-2105","issn":["1471-2105"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC Bioinformatics","raw_type":"journal-article"},{"id":"pmid:41620619","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41620619","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC bioinformatics","raw_type":null},{"id":"pmh:oai:doaj.org/article:a77fe2bdd6e34ba9a97b7b0a1738dacf","is_oa":true,"landing_page_url":"https://doaj.org/article/a77fe2bdd6e34ba9a97b7b0a1738dacf","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"BMC Bioinformatics, Vol 27, Iss 1 (2026)","raw_type":"article"},{"id":"pmh:oai:pubmedcentral.nih.gov:12947515","is_oa":true,"landing_page_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12947515/","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"BMC Bioinformatics","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1186/s12859-026-06372-9","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s12859-026-06372-9","pdf_url":null,"source":{"id":"https://openalex.org/S19032547","display_name":"BMC Bioinformatics","issn_l":"1471-2105","issn":["1471-2105"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC Bioinformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1981903823","https://openalex.org/W2023562174","https://openalex.org/W2109816625","https://openalex.org/W2111786429","https://openalex.org/W2135093599","https://openalex.org/W2141003052","https://openalex.org/W2155653793","https://openalex.org/W2156665896","https://openalex.org/W2157395790","https://openalex.org/W2157825442","https://openalex.org/W2166093121","https://openalex.org/W2295598076","https://openalex.org/W2333651975","https://openalex.org/W2664267452","https://openalex.org/W2795989238","https://openalex.org/W2796366395","https://openalex.org/W2801400022","https://openalex.org/W2886609621","https://openalex.org/W2906082666","https://openalex.org/W2946008612","https://openalex.org/W2974936123","https://openalex.org/W2995426409","https://openalex.org/W3109203717","https://openalex.org/W3113905359","https://openalex.org/W3159959160","https://openalex.org/W4200620750","https://openalex.org/W4206163452","https://openalex.org/W4296028038","https://openalex.org/W4308326503","https://openalex.org/W4313505502","https://openalex.org/W4317933081","https://openalex.org/W4379932297","https://openalex.org/W4385371904","https://openalex.org/W4391880292","https://openalex.org/W4399332294"],"related_works":[],"abstract_inverted_index":{"Bulk":[0],"RNA":[1],"sequencing":[2],"data":[3,141],"is":[4,31],"often":[5],"leveraged":[6],"to":[7,26,80,161],"build":[8],"machine":[9],"learning":[10],"(ML)-based":[11],"predictive":[12],"models":[13,30,98,112],"for":[14],"classification":[15],"of":[16,159],"disease":[17],"groups":[18],"or":[19],"subtypes,":[20],"but":[21],"the":[22,39,44,82,92,120],"sample":[23,77,106,125,147,162,178,184],"size":[24,185],"needed":[25,79],"adequately":[27],"train":[28],"these":[29],"unknown.":[32],"We":[33],"collected":[34],"27":[35],"experimental":[36,117],"datasets":[37,121],"from":[38,165],"Gene":[40],"Expression":[41],"Omnibus":[42],"and":[43,68,76,89,104,138],"Cancer":[45],"Genome":[46],"Atlas.":[47],"In":[48,180],"24/27":[49],"datasets,":[50],"pseudo-data":[51],"were":[52,61,74,87,113,127,142,153],"simulated":[53],"using":[54],"Bayesian":[55],"Network":[56],"Generation.":[57],"Three":[58],"ML":[59,173],"algorithms":[60],"assessed:":[62],"XGBoost":[63],"(XGB),":[64],"Random":[65],"Forest":[66],"(RF),":[67],"Neural":[69],"Networks":[70],"(NN).":[71,131],"Learning":[72],"curves":[73],"fit,":[75],"sizes":[78,107,126,163],"reach":[81],"full-dataset":[83],"AUC":[84],"minus":[85],"0.02":[86],"determined":[88],"compared":[90],"across":[91],"datasets/algorithms.":[93],"Multivariable":[94],"negative":[95],"binomial":[96],"regression":[97],"quantified":[99],"relationships":[100],"between":[101],"dataset-level":[102],"characteristics":[103],"required":[105,124,146,176],"within":[108],"each":[109],"algorithm.":[110],"These":[111],"validated":[114],"in":[115,155],"independent":[116],"datasets.":[118],"Across":[119],"studied,":[122],"median":[123],"480":[128],"(XGB)/190":[129],"(RF)/269":[130],"Higher":[132],"effect":[133],"sizes,":[134],"less":[135,139],"class":[136],"imbalance/dispersion,":[137],"complex":[140],"associated":[143],"with":[144],"lower":[145],"size.":[148],"Validation":[149],"demonstrated":[150],"that":[151,172],"predictions":[152],"accurate":[154],"new":[156],"data.":[157],"Comparison":[158],"results":[160],"obtained":[164],"differential":[166],"analysis":[167,169,190],"power":[168,189],"methods":[170,174],"showed":[171],"generally":[175],"larger":[177],"sizes.":[179],"conclusion,":[181],"incorporating":[182],"ML-based":[183],"planning":[186],"alongside":[187],"traditional":[188],"can":[191],"provide":[192],"more":[193],"robust":[194],"results.":[195]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2026-02-02T00:00:00"}
