{"id":"https://openalex.org/W4416337313","doi":"https://doi.org/10.1186/s13321-025-01099-w","title":"Beyond performance: how design choices shape chemical language models","display_name":"Beyond performance: how design choices shape chemical language models","publication_year":2025,"publication_date":"2025-11-18","ids":{"openalex":"https://openalex.org/W4416337313","doi":"https://doi.org/10.1186/s13321-025-01099-w","pmid":"https://pubmed.ncbi.nlm.nih.gov/41254742"},"language":"en","primary_location":{"id":"doi:10.1186/s13321-025-01099-w","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-01099-w","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01099-w","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01099-w","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5092622713","display_name":"Inken Fender","orcid":"https://orcid.org/0009-0000-1230-0260"},"institutions":[{"id":"https://openalex.org/I118564535","display_name":"University of Bern","ror":"https://ror.org/02k7v4d05","country_code":"CH","type":"education","lineage":["https://openalex.org/I118564535"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Inken Fender","raw_affiliation_strings":["Graduate School for Cellular and Biomedical Sciences (GCB), University of Bern, Mittelstrasse 43, 3012, Bern, Switzerland","Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland"],"affiliations":[{"raw_affiliation_string":"Graduate School for Cellular and Biomedical Sciences (GCB), University of Bern, Mittelstrasse 43, 3012, Bern, Switzerland","institution_ids":["https://openalex.org/I118564535"]},{"raw_affiliation_string":"Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland","institution_ids":["https://openalex.org/I118564535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094173190","display_name":"Jannik Adrian Gut","orcid":null},"institutions":[{"id":"https://openalex.org/I118564535","display_name":"University of Bern","ror":"https://ror.org/02k7v4d05","country_code":"CH","type":"education","lineage":["https://openalex.org/I118564535"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Jannik Adrian Gut","raw_affiliation_strings":["Graduate School for Cellular and Biomedical Sciences (GCB), University of Bern, Mittelstrasse 43, 3012, Bern, Switzerland","Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland"],"affiliations":[{"raw_affiliation_string":"Graduate School for Cellular and Biomedical Sciences (GCB), University of Bern, Mittelstrasse 43, 3012, Bern, Switzerland","institution_ids":["https://openalex.org/I118564535"]},{"raw_affiliation_string":"Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland","institution_ids":["https://openalex.org/I118564535"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069828726","display_name":"Thomas Lemmin","orcid":"https://orcid.org/0000-0001-5705-4964"},"institutions":[{"id":"https://openalex.org/I118564535","display_name":"University of Bern","ror":"https://ror.org/02k7v4d05","country_code":"CH","type":"education","lineage":["https://openalex.org/I118564535"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Thomas Lemmin","raw_affiliation_strings":["Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland. thomas.lemmin@unibe.ch","Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland"],"affiliations":[{"raw_affiliation_string":"Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland. thomas.lemmin@unibe.ch","institution_ids":["https://openalex.org/I118564535"]},{"raw_affiliation_string":"Institute of Biochemistry and Molecular Medicine, University of Bern, B\u00fchlstrasse 28, 3012, Bern, Switzerland","institution_ids":["https://openalex.org/I118564535"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5092622713"],"corresponding_institution_ids":["https://openalex.org/I118564535"],"apc_list":{"value":1290,"currency":"GBP","value_usd":1582},"apc_paid":{"value":1290,"currency":"GBP","value_usd":1582},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.25934262,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"17","issue":"1","first_page":"173","last_page":"173"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.883400022983551,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.883400022983551,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.07940000295639038,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.003100000089034438,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9545000195503235},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5568000078201294},{"id":"https://openalex.org/keywords/chemical-space","display_name":"Chemical space","score":0.5343000292778015},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.5252000093460083},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5120999813079834},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4625999927520752},{"id":"https://openalex.org/keywords/curse-of-dimensionality","display_name":"Curse of dimensionality","score":0.4519999921321869},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4065000116825104}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9545000195503235},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8022000193595886},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5568000078201294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5479000210762024},{"id":"https://openalex.org/C99726746","wikidata":"https://www.wikidata.org/wiki/Q906396","display_name":"Chemical space","level":3,"score":0.5343000292778015},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.5252000093460083},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5120999813079834},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.48489999771118164},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4625999927520752},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.4519999921321869},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4505000114440918},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4065000116825104},{"id":"https://openalex.org/C203394866","wikidata":"https://www.wikidata.org/wiki/Q2881060","display_name":"Chemical database","level":2,"score":0.40459999442100525},{"id":"https://openalex.org/C70518039","wikidata":"https://www.wikidata.org/wiki/Q16000077","display_name":"Dimensionality reduction","level":2,"score":0.3847000002861023},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.36910000443458557},{"id":"https://openalex.org/C103697762","wikidata":"https://www.wikidata.org/wiki/Q4112105","display_name":"Virtual screening","level":3,"score":0.3353999853134155},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.3303999900817871},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.3287999927997589},{"id":"https://openalex.org/C68762167","wikidata":"https://www.wikidata.org/wiki/Q910164","display_name":"Cheminformatics","level":2,"score":0.3212999999523163},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2874000072479248},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.2554999887943268},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25429999828338623}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1186/s13321-025-01099-w","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-01099-w","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01099-w","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},{"id":"pmid:41254742","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41254742","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of cheminformatics","raw_type":null},{"id":"pmh:oai:europepmc.org:11433274","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/12625328","pdf_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12625328/pdf/13321_2025_Article_1099.pdf","source":{"id":"https://openalex.org/S4306400806","display_name":"Europe PMC (PubMed Central)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1303153112","host_organization_name":"European Bioinformatics Institute","host_organization_lineage":["https://openalex.org/I1303153112"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Text"},{"id":"pmh:oai:doaj.org/article:c9fafc83473e429abf8b8ddf937a2300","is_oa":true,"landing_page_url":"https://doaj.org/article/c9fafc83473e429abf8b8ddf937a2300","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Journal of Cheminformatics, Vol 17, Iss 1, Pp 1-15 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1186/s13321-025-01099-w","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13321-025-01099-w","pdf_url":"https://jcheminf.biomedcentral.com/counter/pdf/10.1186/s13321-025-01099-w","source":{"id":"https://openalex.org/S180838163","display_name":"Journal of Cheminformatics","issn_l":"1758-2946","issn":["1758-2946"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Cheminformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G6833106771","display_name":null,"funder_award_id":"194606","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320320924","display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung","ror":"https://ror.org/00yjd3n13"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416337313.pdf","grobid_xml":"https://content.openalex.org/works/W4416337313.grobid-xml"},"referenced_works_count":34,"referenced_works":["https://openalex.org/W1541765387","https://openalex.org/W1650728208","https://openalex.org/W1857789879","https://openalex.org/W1968511284","https://openalex.org/W1975147762","https://openalex.org/W1976161355","https://openalex.org/W2015688756","https://openalex.org/W2034549041","https://openalex.org/W2066273100","https://openalex.org/W2135046866","https://openalex.org/W2461093577","https://openalex.org/W2895677720","https://openalex.org/W2909063104","https://openalex.org/W2933138175","https://openalex.org/W2947423323","https://openalex.org/W2963250244","https://openalex.org/W2963341956","https://openalex.org/W3009321976","https://openalex.org/W3034999214","https://openalex.org/W3086801715","https://openalex.org/W3138781613","https://openalex.org/W4281619372","https://openalex.org/W4306179830","https://openalex.org/W4307468223","https://openalex.org/W4313485929","https://openalex.org/W4379184641","https://openalex.org/W4388297464","https://openalex.org/W4399528430","https://openalex.org/W4400046055","https://openalex.org/W4401035236","https://openalex.org/W4401558372","https://openalex.org/W4401626968","https://openalex.org/W4403691894","https://openalex.org/W4404599510"],"related_works":[],"abstract_inverted_index":{"Chemical":[0],"language":[1],"models":[2,56],"(CLMs)":[3],"have":[4],"shown":[5],"strong":[6],"performance":[7,33,50,82],"in":[8,92],"molecular":[9,23],"property":[10],"prediction":[11,132],"and":[12,28,34,51,62,75,95,118,151],"generation":[13],"tasks.":[14],"However,":[15],"the":[16,64,93,145],"impact":[17],"of":[18,66,97,147],"design":[19,102],"choices,":[20],"such":[21],"as":[22,134],"representation":[24],"format,":[25],"tokenization":[26,114],"strategy,":[27],"model":[29,86,121],"architecture,":[30],"on":[31,59],"both":[32],"chemical":[35,52,107],"interpretability":[36,96],"remains":[37,125],"underexplored.":[38],"In":[39,111],"this":[40],"study,":[41],"we":[42],"systematically":[43],"evaluate":[44],"how":[45,106],"these":[46],"factors":[47],"influence":[48],"CLM":[49],"understanding.":[53],"We":[54],"evaluated":[55],"through":[57],"fine-tuning":[58],"downstream":[60,80],"tasks":[61],"probing":[63,71],"structure":[65,94],"their":[67],"latent":[68],"spaces":[69],"using":[70],"predictors,":[72],"vector":[73],"operations,":[74],"dimensionality":[76],"reduction":[77],"techniques.":[78],"Although":[79],"task":[81],"was":[83],"similar":[84],"across":[85],"configurations,":[87],"substantial":[88],"differences":[89],"were":[90],"observed":[91],"internal":[98],"representations,":[99],"highlighting":[100],"that":[101],"choices":[103],"meaningfully":[104],"shape":[105],"information":[108],"is":[109],"encoded.":[110],"practice,":[112],"atomwise":[113],"generally":[115],"improved":[116],"interpretability,":[117],"a":[119,126],"RoBERTa-based":[120],"with":[122],"SMILES":[123],"input":[124],"reliable":[127],"starting":[128],"point":[129],"for":[130,144],"standard":[131],"tasks,":[133],"no":[135],"alternative":[136],"consistently":[137],"outperformed":[138],"it.":[139],"These":[140],"results":[141],"provide":[142],"guidance":[143],"development":[146],"more":[148],"chemically":[149],"grounded":[150],"interpretable":[152],"CLMs.":[153]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-11-18T00:00:00"}
