{"id":"https://openalex.org/W4409729737","doi":"https://doi.org/10.1186/s13321-025-00986-6","title":"Molecular property prediction using pretrained-BERT and Bayesian active learning: a data-efficient approach to drug design","display_name":"Molecular property prediction using pretrained-BERT and Bayesian active learning: a data-efficient approach to drug design","publication_year":2025,"publication_date":"2025-04-23","ids":{"openalex":"https://openalex.org/W4409729737","doi":"https://doi.org/10.1186/s13321-025-00986-6","pmid":"https://pubmed.ncbi.nlm.nih.gov/40270038"},"language":"en","primary_location":{"id":"doi:10.1186/s13321-025-00986-6","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-00986-6","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-00986-6","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-00986-6","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030874161","display_name":"Muhammad Arslan Masood","orcid":"https://orcid.org/0000-0002-9190-3023"},"institutions":[{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI"],"is_corresponding":true,"raw_author_name":"Muhammad Arslan Masood","raw_affiliation_strings":["Department of Computer Science, Aalto University, Espoo, Finland. arslan.masood@aalto.fi"],"raw_orcid":"https://orcid.org/0000-0002-9190-3023","affiliations":[{"raw_affiliation_string":"Department of Computer Science, Aalto University, Espoo, Finland. arslan.masood@aalto.fi","institution_ids":["https://openalex.org/I9927081"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018305257","display_name":"Samuel Kaski","orcid":"https://orcid.org/0000-0003-1925-9154"},"institutions":[{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]},{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI","GB"],"is_corresponding":false,"raw_author_name":"Samuel Kaski","raw_affiliation_strings":["Department of Computer Science, Aalto University, Espoo, Finland","Department of Computer Science, University of Manchester, Manchester, UK"],"raw_orcid":"https://orcid.org/0000-0003-1925-9154","affiliations":[{"raw_affiliation_string":"Department of Computer Science, Aalto University, Espoo, Finland","institution_ids":["https://openalex.org/I9927081"]},{"raw_affiliation_string":"Department of Computer Science, University of Manchester, Manchester, UK","institution_ids":["https://openalex.org/I28407311"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101919748","display_name":"Tianyu Cui","orcid":"https://orcid.org/0000-0003-2392-0689"},"institutions":[{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Tianyu Cui","raw_affiliation_strings":["Department of Computer Science, Aalto University, Espoo, Finland"],"raw_orcid":"https://orcid.org/0000-0003-2392-0689","affiliations":[{"raw_affiliation_string":"Department of Computer Science, Aalto University, Espoo, Finland","institution_ids":["https://openalex.org/I9927081"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5030874161"],"corresponding_institution_ids":["https://openalex.org/I9927081"],"apc_list":{"value":1290,"currency":"GBP","value_usd":1582},"apc_paid":{"value":1290,"currency":"GBP","value_usd":1582},"fwci":5.2362,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.9532306,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"17","issue":"1","first_page":"58","last_page":"58"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10908","display_name":"Analytical Chemistry and Chromatography","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1607","display_name":"Spectroscopy"},"field":{"id":"https://openalex.org/fields/16","display_name":"Chemistry"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8082820177078247},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.6771834492683411},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.638788640499115},{"id":"https://openalex.org/keywords/active-learning","display_name":"Active learning (machine learning)","score":0.5978837609291077},{"id":"https://openalex.org/keywords/drug-discovery","display_name":"Drug discovery","score":0.5886090397834778},{"id":"https://openalex.org/keywords/chemical-space","display_name":"Chemical space","score":0.5758792161941528},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.47142547369003296},{"id":"https://openalex.org/keywords/semi-supervised-learning","display_name":"Semi-supervised learning","score":0.44547536969184875},{"id":"https://openalex.org/keywords/multi-task-learning","display_name":"Multi-task learning","score":0.4338929057121277},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3507709801197052},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.22822332382202148}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8082820177078247},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6771834492683411},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.638788640499115},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.5978837609291077},{"id":"https://openalex.org/C74187038","wikidata":"https://www.wikidata.org/wiki/Q1418791","display_name":"Drug discovery","level":2,"score":0.5886090397834778},{"id":"https://openalex.org/C99726746","wikidata":"https://www.wikidata.org/wiki/Q906396","display_name":"Chemical space","level":3,"score":0.5758792161941528},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.47142547369003296},{"id":"https://openalex.org/C58973888","wikidata":"https://www.wikidata.org/wiki/Q1041418","display_name":"Semi-supervised learning","level":2,"score":0.44547536969184875},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.4338929057121277},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3507709801197052},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.22822332382202148},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":6,"locations":[{"id":"doi:10.1186/s13321-025-00986-6","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-00986-6","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-00986-6","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},{"id":"pmid:40270038","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40270038","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of cheminformatics","raw_type":null},{"id":"pmh:oai:pure.atira.dk:publications/076c4e25-1dfd-44f9-ab2f-119e545ae15b","is_oa":false,"landing_page_url":"https://research.manchester.ac.uk/en/publications/076c4e25-1dfd-44f9-ab2f-119e545ae15b","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Masood, M A, Kaski, S & Cui, T 2025, 'Molecular property prediction using pretrained-BERT and Bayesian active learning : a data-efficient approach to drug design', Journal of Cheminformatics, vol. 17, no. 1, 58, pp. 1-15. https://doi.org/10.1186/s13321-025-00986-6","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:aaltodoc.aalto.fi:123456789/135228","is_oa":true,"landing_page_url":"http://www.scopus.com/inward/record.url?scp=105003141656&partnerID=8YFLogxK","pdf_url":null,"source":{"id":"https://openalex.org/S4306401663","display_name":"Aaltodoc (Aalto University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I9927081","host_organization_name":"Aalto University","host_organization_lineage":["https://openalex.org/I9927081"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"publishedVersion"},{"id":"pmh:oai:doaj.org/article:f992f8c7ec9841af9ec87cd24da29c96","is_oa":true,"landing_page_url":"https://doaj.org/article/f992f8c7ec9841af9ec87cd24da29c96","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Journal of Cheminformatics, Vol 17, Iss 1, Pp 1-15 (2025)","raw_type":"article"},{"id":"pmh:oai:europepmc.org:10830466","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/12020163","pdf_url":null,"source":{"id":"https://openalex.org/S4306400806","display_name":"Europe PMC (PubMed Central)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1303153112","host_organization_name":"European Bioinformatics Institute","host_organization_lineage":["https://openalex.org/I1303153112"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1186/s13321-025-00986-6","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-00986-6","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-00986-6","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G223242278","display_name":null,"funder_award_id":"956832","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G448529986","display_name":null,"funder_award_id":"EP/W002973/1","funder_id":"https://openalex.org/F4320314731","funder_display_name":"UK Research and Innovation"},{"id":"https://openalex.org/G7842005466","display_name":null,"funder_award_id":"Horizon 2020","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320314731","display_name":"UK Research and Innovation","ror":"https://ror.org/001aqnf71"},{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320332050","display_name":"Finnish Center for Artificial Intelligence","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4409729737.pdf","grobid_xml":"https://content.openalex.org/works/W4409729737.grobid-xml"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W1991238353","https://openalex.org/W2033757486","https://openalex.org/W2051224630","https://openalex.org/W2060531713","https://openalex.org/W2085890279","https://openalex.org/W2225156818","https://openalex.org/W2461620095","https://openalex.org/W2591883888","https://openalex.org/W2900415334","https://openalex.org/W2911789160","https://openalex.org/W2962764565","https://openalex.org/W3014596384","https://openalex.org/W3041085747","https://openalex.org/W3044724994","https://openalex.org/W3097280976","https://openalex.org/W3098465726","https://openalex.org/W3104956673","https://openalex.org/W3134774296","https://openalex.org/W3183564643","https://openalex.org/W3213984072","https://openalex.org/W4230030242","https://openalex.org/W4255683093","https://openalex.org/W4387003134","https://openalex.org/W4390221451","https://openalex.org/W4391943965","https://openalex.org/W4392452724","https://openalex.org/W4400784949","https://openalex.org/W4402032238","https://openalex.org/W4405183594","https://openalex.org/W6608287133"],"related_works":["https://openalex.org/W2056314584","https://openalex.org/W4388943133","https://openalex.org/W4388499548","https://openalex.org/W2365028544","https://openalex.org/W4309984931","https://openalex.org/W4282977123","https://openalex.org/W1986633584","https://openalex.org/W2584335068","https://openalex.org/W2949671220","https://openalex.org/W1492505081"],"abstract_inverted_index":{"In":[0],"drug":[1,165,188],"discovery,":[2,166,189],"prioritizing":[3],"compounds":[4],"for":[5,39,171,224],"experimental":[6],"testing":[7],"is":[8,36],"a":[9,68,127,168,196,222],"critical":[10,214],"task":[11],"that":[12,103,122,148,178,198],"can":[13],"be":[14],"optimized":[15],"through":[16,140],"active":[17,79,118,154,184,205],"learning":[18,25,80,86,155,185,206,210],"by":[19,66],"strategically":[20],"selecting":[21],"informative":[22],"molecules.":[23],"Active":[24],"typically":[26],"trains":[27],"models":[28,202],"on":[29,73,97],"labeled":[30,137],"examples":[31],"alone,":[32],"while":[33],"unlabeled":[34,50],"data":[35],"only":[37],"used":[38],"acquisition.":[40],"This":[41,82,145,219],"fully":[42],"supervised":[43],"approach":[44,105,220],"neglects":[45],"valuable":[46],"information":[47],"present":[48],"in":[49,164,187,216],"molecular":[51,151,180],"data,":[52,138],"impairing":[53],"both":[54,158],"predictive":[55],"performance":[56,160],"and":[57,87,99,161],"the":[58,78],"molecule":[59,94],"selection":[60],"process.":[61],"We":[62,176,194],"address":[63],"this":[64],"limitation":[65],"integrating":[67],"transformer-based":[69],"BERT":[70,124],"model,":[71],"pretrained":[72,123,150,200],"1.26":[74],"million":[75],"compounds,":[76],"into":[77],"pipeline.":[81],"effectively":[83],"disentangles":[84],"representation":[85,209],"uncertainty":[88,133,212],"estimation,":[89],"leading":[90],"to":[91,116,207],"more":[92,225],"reliable":[93,132],"selection.":[95,193],"Experiments":[96],"Tox21":[98],"ClinTox":[100],"datasets":[101],"demonstrate":[102,177],"our":[104],"achieves":[106],"equivalent":[107],"toxic":[108],"compound":[109,172],"identification":[110],"with":[111,153,203],"50%":[112],"fewer":[113],"iterations":[114],"compared":[115],"conventional":[117],"learning.":[119],"Analysis":[120],"reveals":[121],"representations":[125,152,181],"generate":[126],"structured":[128],"embedding":[129],"space":[130],"enabling":[131],"estimation":[134],"despite":[135],"limited":[136],"confirmed":[139],"Expected":[141],"Calibration":[142],"Error":[143],"measurements.":[144],"work":[146],"establishes":[147,221],"combining":[149],"significantly":[156],"improves":[157],"model":[159],"acquisition":[162,191],"efficiency":[163],"providing":[167],"scalable":[169],"framework":[170,197],"prioritization.":[173],"SCIENTIFIC":[174],"CONTRIBUTION:":[175],"high-quality":[179],"fundamentally":[182],"determine":[183],"success":[186],"outweighing":[190],"strategy":[192],"provide":[195],"integrates":[199],"transformer":[201],"Bayesian":[204],"separate":[208],"from":[211],"estimation-a":[213],"distinction":[215],"low-data":[217],"scenarios.":[218],"foundation":[223],"efficient":[226],"screening":[227],"workflows":[228],"across":[229],"diverse":[230],"pharmaceutical":[231],"applications.":[232]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-05-02T08:42:23.175194","created_date":"2025-10-10T00:00:00"}
