{"id":"https://openalex.org/W4409461865","doi":"https://doi.org/10.1021/acs.jcim.5c00023","title":"Addressing Imbalanced Classification Problems in Drug Discovery and Development Using Random Forest, Support Vector Machine, AutoGluon-Tabular, and H2O AutoML","display_name":"Addressing Imbalanced Classification Problems in Drug Discovery and Development Using Random Forest, Support Vector Machine, AutoGluon-Tabular, and H2O AutoML","publication_year":2025,"publication_date":"2025-04-15","ids":{"openalex":"https://openalex.org/W4409461865","doi":"https://doi.org/10.1021/acs.jcim.5c00023","pmid":"https://pubmed.ncbi.nlm.nih.gov/40230275"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.5c00023","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c00023","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101435114","display_name":"Ayush Garg","orcid":"https://orcid.org/0009-0006-8347-8465"},"institutions":[{"id":"https://openalex.org/I4210104194","display_name":"Tennessee Cancer Specialists","ror":"https://ror.org/01krbfc31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I4210104194"]},{"id":"https://openalex.org/I55215948","display_name":"Tata Consultancy Services (India)","ror":"https://ror.org/01b9n8m42","country_code":"IN","type":"company","lineage":["https://openalex.org/I4210086519","https://openalex.org/I55215948"]}],"countries":["IN","US"],"is_corresponding":false,"raw_author_name":"Ayush Garg","raw_affiliation_strings":["TCS Research (Life Sciences Division)","Tata Consultancy Services Limited"],"affiliations":[{"raw_affiliation_string":"TCS Research (Life Sciences Division)","institution_ids":["https://openalex.org/I4210104194"]},{"raw_affiliation_string":"Tata Consultancy Services Limited","institution_ids":["https://openalex.org/I55215948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007730641","display_name":"Narayanan Ramamurthi","orcid":"https://orcid.org/0009-0001-1067-9040"},"institutions":[{"id":"https://openalex.org/I55215948","display_name":"Tata Consultancy Services (India)","ror":"https://ror.org/01b9n8m42","country_code":"IN","type":"company","lineage":["https://openalex.org/I4210086519","https://openalex.org/I55215948"]},{"id":"https://openalex.org/I4210104194","display_name":"Tennessee Cancer Specialists","ror":"https://ror.org/01krbfc31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I4210104194"]}],"countries":["IN","US"],"is_corresponding":false,"raw_author_name":"Narayanan Ramamurthi","raw_affiliation_strings":["TCS Research (Life Sciences Division)","Tata Consultancy Services Limited"],"affiliations":[{"raw_affiliation_string":"TCS Research (Life Sciences Division)","institution_ids":["https://openalex.org/I4210104194"]},{"raw_affiliation_string":"Tata Consultancy Services Limited","institution_ids":["https://openalex.org/I55215948"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063954917","display_name":"Shyam Sundar Das","orcid":"https://orcid.org/0009-0004-9413-1753"},"institutions":[{"id":"https://openalex.org/I4210104194","display_name":"Tennessee Cancer Specialists","ror":"https://ror.org/01krbfc31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I4210104194"]},{"id":"https://openalex.org/I55215948","display_name":"Tata Consultancy Services (India)","ror":"https://ror.org/01b9n8m42","country_code":"IN","type":"company","lineage":["https://openalex.org/I4210086519","https://openalex.org/I55215948"]}],"countries":["IN","US"],"is_corresponding":true,"raw_author_name":"Shyam Sundar Das","raw_affiliation_strings":["TCS Research (Life Sciences Division)","Tata Consultancy Services Limited"],"affiliations":[{"raw_affiliation_string":"TCS Research (Life Sciences Division)","institution_ids":["https://openalex.org/I4210104194"]},{"raw_affiliation_string":"Tata Consultancy Services Limited","institution_ids":["https://openalex.org/I55215948"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5063954917"],"corresponding_institution_ids":["https://openalex.org/I4210104194","https://openalex.org/I55215948"],"apc_list":null,"apc_paid":null,"fwci":13.0618,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.98320249,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":100},"biblio":{"volume":"65","issue":"8","first_page":"3976","last_page":"3989"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12878","display_name":"Pharmaceutical Quality and Counterfeiting","score":0.9732000231742859,"subfield":{"id":"https://openalex.org/subfields/2739","display_name":"Public Health, Environmental and Occupational Health"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11396","display_name":"Artificial Intelligence in Healthcare","score":0.9580000042915344,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/random-forest","display_name":"Random forest","score":0.8519690036773682},{"id":"https://openalex.org/keywords/support-vector-machine","display_name":"Support vector machine","score":0.6592913866043091},{"id":"https://openalex.org/keywords/drug-discovery","display_name":"Drug discovery","score":0.6520553231239319},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5262973308563232},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4844503402709961},{"id":"https://openalex.org/keywords/drug-development","display_name":"Drug development","score":0.484239786863327},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4376620054244995},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.34178394079208374},{"id":"https://openalex.org/keywords/computational-biology","display_name":"Computational biology","score":0.34148943424224854},{"id":"https://openalex.org/keywords/drug","display_name":"Drug","score":0.2967854142189026},{"id":"https://openalex.org/keywords/bioinformatics","display_name":"Bioinformatics","score":0.17872479557991028},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.16810190677642822},{"id":"https://openalex.org/keywords/pharmacology","display_name":"Pharmacology","score":0.12853088974952698}],"concepts":[{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.8519690036773682},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.6592913866043091},{"id":"https://openalex.org/C74187038","wikidata":"https://www.wikidata.org/wiki/Q1418791","display_name":"Drug discovery","level":2,"score":0.6520553231239319},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5262973308563232},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4844503402709961},{"id":"https://openalex.org/C64903051","wikidata":"https://www.wikidata.org/wiki/Q2198549","display_name":"Drug development","level":3,"score":0.484239786863327},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4376620054244995},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.34178394079208374},{"id":"https://openalex.org/C70721500","wikidata":"https://www.wikidata.org/wiki/Q177005","display_name":"Computational biology","level":1,"score":0.34148943424224854},{"id":"https://openalex.org/C2780035454","wikidata":"https://www.wikidata.org/wiki/Q8386","display_name":"Drug","level":2,"score":0.2967854142189026},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.17872479557991028},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.16810190677642822},{"id":"https://openalex.org/C98274493","wikidata":"https://www.wikidata.org/wiki/Q128406","display_name":"Pharmacology","level":1,"score":0.12853088974952698}],"mesh":[{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000076722","descriptor_name":"Drug Development","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000076722","descriptor_name":"Drug Development","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000093743","descriptor_name":"Random Forest","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000093743","descriptor_name":"Random Forest","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D055808","descriptor_name":"Drug Discovery","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D055808","descriptor_name":"Drug Discovery","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D060388","descriptor_name":"Support Vector Machine","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D060388","descriptor_name":"Support Vector Machine","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":2,"locations":[{"id":"doi:10.1021/acs.jcim.5c00023","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c00023","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:40230275","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40230275","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Zero hunger","id":"https://metadata.un.org/sdg/2","score":0.49000000953674316},{"display_name":"Life in Land","id":"https://metadata.un.org/sdg/15","score":0.4099999964237213}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1433325481","https://openalex.org/W1895993106","https://openalex.org/W1941659294","https://openalex.org/W1993220166","https://openalex.org/W2015452969","https://openalex.org/W2039412159","https://openalex.org/W2049946556","https://openalex.org/W2068258779","https://openalex.org/W2101234009","https://openalex.org/W2103919309","https://openalex.org/W2129374946","https://openalex.org/W2131046966","https://openalex.org/W2142572836","https://openalex.org/W2151241390","https://openalex.org/W2159887157","https://openalex.org/W2194079679","https://openalex.org/W2230801863","https://openalex.org/W2592074753","https://openalex.org/W2887113605","https://openalex.org/W2903362744","https://openalex.org/W2905047249","https://openalex.org/W2907473500","https://openalex.org/W2911964244","https://openalex.org/W2955219525","https://openalex.org/W2990580840","https://openalex.org/W3005880794","https://openalex.org/W3006913750","https://openalex.org/W3023211159","https://openalex.org/W3024473672","https://openalex.org/W3026299583","https://openalex.org/W3037682562","https://openalex.org/W3097920362","https://openalex.org/W3116490021","https://openalex.org/W3135028703","https://openalex.org/W3142155912","https://openalex.org/W3146401251","https://openalex.org/W3170120958","https://openalex.org/W3185271498","https://openalex.org/W3186083588","https://openalex.org/W4255056905","https://openalex.org/W4282929392","https://openalex.org/W4296162381","https://openalex.org/W4308333922","https://openalex.org/W4309115408","https://openalex.org/W4321524264","https://openalex.org/W4387346511","https://openalex.org/W4391324572","https://openalex.org/W4404147332"],"related_works":["https://openalex.org/W3193043704","https://openalex.org/W4386259002","https://openalex.org/W1546989560","https://openalex.org/W4396689146","https://openalex.org/W4200112873","https://openalex.org/W2955796858","https://openalex.org/W2004826645","https://openalex.org/W4394984040","https://openalex.org/W2127264463","https://openalex.org/W2764191054"],"abstract_inverted_index":{"The":[0,217,482],"classification":[1,39,443],"models":[2,40,432],"built":[3],"on":[4,234,394,403],"class":[5,22,36,69,101,162,170,345,368],"imbalanced":[6,100,291,442,462],"data":[7,151,157,176,271,292,314,400,455,463],"sets":[8,158,177,272,315,401,464],"tend":[9],"to":[10,33,52,98,180,263,306,352,465,486],"prioritize":[11],"the":[12,15,20,35,53,62,68,75,92,129,132,165,168,181,270,313,328,339,344,355,367,407,414,419,430,472,476,487],"accuracy":[13,336],"of":[14,55,61,64,91,94,141,145,167,200,214,220,231,290,331,357,370,399,418,453,471,510],"majority":[16],"class,":[17],"and":[18,41,48,85,108,127,143,149,171,184,193,203,207,240,244,250,257,266,281,299,309,324,360,376,421,433,490,497,513],"thus,":[19],"minority":[21],"generally":[23],"has":[24],"a":[25,88,395,500,507],"higher":[26,397],"misclassification":[27],"rate.":[28],"Different":[29],"techniques":[30,66,97,411,457,474,478],"are":[31,223,285,484],"available":[32,73],"address":[34,99],"imbalance":[37,447],"in":[38,74,82,334,354,434,494],"can":[42,273,316,502],"be":[43,274,317,503],"categorized":[44],"as":[45,198,212,224,238,427,429,469],"data-level,":[46],"algorithm-level,":[47],"hybrid":[49],"methods.":[50],"But":[51],"best":[54],"our":[56,114,221],"knowledge,":[57],"an":[58],"in-depth":[59],"analysis":[60,90],"performance":[63,93,288,468],"these":[65,80],"against":[67],"ratio":[70,166,346,369],"is":[71,228,337,347,364,381,458],"not":[72,389],"literature.":[76],"We":[77,187],"have":[78,86,117,188],"addressed":[79],"shortcomings":[81],"this":[83,495],"study":[84,501],"performed":[87],"detailed":[89],"four":[95,119],"different":[96,161],"distribution":[102],"using":[103,124,153],"machine":[104,146,196],"learning":[105,147],"(ML)":[106],"methods":[107,255,393,417,489,512],"AutoML":[109,142,209,215,296,377,424,491,514],"tools.":[110,216],"To":[111],"carry":[112],"out":[113,505],"study,":[115,496],"we":[116],"selected":[118],"such":[120,237],"techniques\u2500(a)":[121],"threshold":[122,232],"optimization":[123,233],"(i)":[125,226],"GHOST":[126],"(ii)":[128,252],"area":[130],"under":[131],"precision-recall":[133],"curve":[134],"(AUPR)":[135],"curve,":[136],"(b)":[137],"internal":[138,415,477],"balancing":[139,152,410,416,456],"method":[140],"class-weight":[144],"methods,":[148],"(c)":[150],"SMOTETomek\u2500and":[154],"generated":[155],"27":[156],"considering":[159,506],"nine":[160],"ratios":[163],"(i.e.,":[164],"positive":[169],"total":[172],"samples)":[173],"from":[174,350],"three":[175,408],"that":[178,338,383],"belong":[179],"drug":[182],"discovery":[183],"development":[185],"field.":[186],"employed":[189],"random":[190],"forest":[191],"(RF)":[192],"support":[194],"vector":[195],"(SVM)":[197],"representatives":[199,213],"ML":[201,254,375,420,431,488,511],"classifier":[202],"AutoGluon-Tabular":[204,298],"(version":[205,210],"0.6.1)":[206],"H2O":[208,300],"3.40.0.4)":[211],"important":[218],"findings":[219],"studies":[222],"follows:":[225],"there":[227],"no":[229],"effect":[230],"ranking":[235],"metrics":[236],"AUC":[239,243],"AUPR,":[241],"but":[242],"AUPR":[245],"get":[246],"affected":[247],"by":[248],"class-weighting":[249],"SMOTTomek;":[251],"for":[253,277,287,295,320,373,440,460,498],"RF":[256],"SVM,":[258],"significant":[259,302],"percentage":[260,303,332,340],"improvement":[261,304,333,341,363],"up":[262,305],"375,":[264],"33.33,":[265],"450":[267],"over":[268,311],"all":[269,312,391],"achieved,":[275,318],"respectively,":[276,319],"F1":[278,321,358,404],"score,":[279,322],"MCC,":[280,323,361],"balanced":[282,325,335],"accuracy,":[283],"which":[284],"suitable":[286],"evaluation":[289],"sets;":[293],"(iii)":[294],"libraries":[297,492],"AutoML,":[301],"383.33,":[307],"37.25,":[308],"533.33":[310],"accuracy;":[326],"(iv)":[327],"general":[329],"pattern":[330],"increases":[342],"when":[343,444],"systematically":[348],"decreased":[349],"0.5":[351],"0.1;":[353],"case":[356],"score":[359],"maximum":[362],"achieved":[365],"at":[366],"0.3;":[371],"(v)":[372],"both":[374],"with":[378,446],"balancing,":[379],"it":[380],"observed":[382],"any":[384],"individual":[385],"class-balancing":[386],"technique":[387],"does":[388],"outperform":[390,479],"other":[392],"significantly":[396],"number":[398,509],"based":[402],"score;":[405],"(vi)":[406],"external":[409,473],"combined":[412],"outperformed":[413],"AutoML;":[422],"(vii)":[423],"tools":[425],"perform":[426,437],"good":[428],"some":[435],"cases":[436],"even":[438],"better":[439],"handling":[441,448],"applied":[445],"techniques.":[449],"In":[450],"summary,":[451],"exploration":[452],"multiple":[454],"recommended":[459],"classifying":[461],"achieve":[466],"optimal":[467],"neither":[470],"nor":[475],"others":[480],"significantly.":[481],"results":[483],"specific":[485],"used":[493],"generalization,":[499],"carried":[504],"sizable":[508],"libraries.":[515]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
