{"id":"https://openalex.org/W7159114445","doi":"https://doi.org/10.48550/arxiv.2604.26498","title":"Do Larger Models Really Win in Drug Discovery? A Benchmark Assessment of Model Scaling in AI-Driven Molecular Property and Activity Prediction","display_name":"Do Larger Models Really Win in Drug Discovery? A Benchmark Assessment of Model Scaling in AI-Driven Molecular Property and Activity Prediction","publication_year":2026,"publication_date":"2026-04-29","ids":{"openalex":"https://openalex.org/W7159114445","doi":"https://doi.org/10.48550/arxiv.2604.26498"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.26498","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26498","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.26498","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057104359","display_name":"Jinjiang Guo","orcid":"https://orcid.org/0000-0002-9157-2199"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guo, Jinjiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5057104359"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9564999938011169,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9564999938011169,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.005499999970197678,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.0052999998442828655,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/property","display_name":"Property (philosophy)","score":0.6237000226974487},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5242000222206116},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.49549999833106995},{"id":"https://openalex.org/keywords/cheminformatics","display_name":"Cheminformatics","score":0.486299991607666},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.47360000014305115},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4345000088214874},{"id":"https://openalex.org/keywords/applicability-domain","display_name":"Applicability domain","score":0.3790000081062317},{"id":"https://openalex.org/keywords/extension","display_name":"Extension (predicate logic)","score":0.3603000044822693}],"concepts":[{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.6237000226974487},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6204000115394592},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6017000079154968},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.573199987411499},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5242000222206116},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.49549999833106995},{"id":"https://openalex.org/C68762167","wikidata":"https://www.wikidata.org/wiki/Q910164","display_name":"Cheminformatics","level":2,"score":0.486299991607666},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.47360000014305115},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4345000088214874},{"id":"https://openalex.org/C107908354","wikidata":"https://www.wikidata.org/wiki/Q4781456","display_name":"Applicability domain","level":3,"score":0.3790000081062317},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.3603000044822693},{"id":"https://openalex.org/C2779714256","wikidata":"https://www.wikidata.org/wiki/Q25305062","display_name":"Multiple Models","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.32820001244544983},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.32670000195503235},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2639999985694885},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.2549000084400177}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.26498","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26498","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.26498","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26498","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,66],"rapid":[1],"growth":[2],"of":[3,17,169],"molecular":[4,51,115,246],"foundation":[5],"models":[6,10,26,33,146,178,241,252],"and":[7,34,56,63,71,78,103,117,132,140,150,154,176,223,248,258,274],"large":[8],"language":[9],"has":[11],"encouraged":[12],"a":[13,94,233],"scale":[14,279],"centred":[15],"view":[16],"AI":[18],"in":[19,22,99,191,260],"drug":[20],"discovery,":[21],"which":[23],"larger":[24],"pretrained":[25,114,144],"are":[27],"expected":[28],"to":[29,86,101,197],"supersede":[30],"compact":[31],"cheminformatics":[32],"graph":[35],"neural":[36],"networks":[37],"(GNNs)":[38],"trained":[39],"for":[40,50,236,245,255],"individual":[41,211],"tasks.":[42],"We":[43],"test":[44],"this":[45,170],"assumption":[46],"across":[47],"26":[48],"endpoints":[49],"properties,":[52],"toxicity,":[53],"safety":[54],"liabilities":[55],"biological":[57],"activity,":[58],"grouped":[59],"into":[60],"ADME,":[61],"toxicity":[62],"bioactivity":[64],"classes.":[65],"benchmark":[67],"contains":[68],"78":[69],"endpoint":[70],"split":[72,164,199],"entries":[73],"spanning":[74],"random,":[75],"Murcko":[76],"scaffold":[77,97],"structure":[79],"separated":[80],"5-fold":[81],"CV.":[82],"Ordered":[83],"from":[84,217],"easiest":[85],"hardest,":[87],"these":[88],"splits":[89],"approximate":[90],"retrospective":[91],"evaluation":[92],"on":[93,106,268,278],"closed":[95],"library,":[96],"expansion":[98,105],"hit":[100],"lead,":[102],"library":[104],"novel":[107],"chemotypes.":[108],"Each":[109],"entry":[110],"includes":[111],"ML,":[112],"GNN,":[113],"sequence":[116,145,177],"LLM":[118,155,186],"based":[119,156,187,231],"SAR":[120,157,188,214,256],"families.":[121],"Across":[122],"156":[123],"fold":[124],"mean":[125],"comparisons,":[126],"classical":[127],"ML":[128,161],"such":[129,137,147],"as":[130,138,148],"RF(ECFP4)":[131],"ExtraTrees(RDKit)":[133],"win":[134,142,152,159],"116,":[135],"GNNs":[136],"GIN":[139],"Ligandformer":[141],"25,":[143],"MoLFormer":[149],"ChemBERTa2":[151],"12,":[153],"baselines":[158],"three.":[160],"dominates":[162],"random":[163],"interpolation":[165],"but":[166,181,226,264],"loses":[167],"part":[168],"advantage":[171],"under":[172],"harder":[173],"splits;":[174],"GNN":[175],"also":[179],"decline":[180],"gain":[182],"relative":[183],"ground,":[184],"whereas":[185],"is":[189],"weaker":[190],"absolute":[192],"terms":[193],"yet":[194],"less":[195],"sensitive":[196],"the":[198,269],"axis.":[200],"Paired":[201],"bootstrap":[202],"analyses":[203],"support":[204],"family":[205],"level":[206],"trends":[207],"more":[208],"strongly":[209],"than":[210],"model":[212],"rankings.":[213],"knowledge":[215],"derived":[216],"training":[218],"folds":[219],"improves":[220],"many":[221],"GPT5.5-SAR":[222],"Opus4.7-SAR":[224],"metrics":[225],"does":[227],"not":[228,277],"make":[229],"rule":[230],"reasoning":[232,259],"universal":[234],"substitute":[235],"supervised":[237],"predictors.":[238],"Compact":[239],"specialized":[240],"remain":[242],"highly":[243],"effective":[244],"property":[247],"activity":[249],"prediction.":[250],"Larger":[251],"add":[253],"value":[254],"interpretation":[257],"low":[261],"data":[262],"settings,":[263],"predictive":[265],"performance":[266],"depends":[267],"fit":[270],"among":[271],"model,":[272],"task":[273],"validation":[275],"scenario,":[276],"alone.":[280]},"counts_by_year":[],"updated_date":"2026-05-19T06:13:58.757530","created_date":"2026-05-01T00:00:00"}
