{"id":"https://openalex.org/W7160279007","doi":"https://doi.org/10.48550/arxiv.2605.02745","title":"Bolek: A Multimodal Language Model for Molecular Reasoning","display_name":"Bolek: A Multimodal Language Model for Molecular Reasoning","publication_year":2026,"publication_date":"2026-05-04","ids":{"openalex":"https://openalex.org/W7160279007","doi":"https://doi.org/10.48550/arxiv.2605.02745"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.02745","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02745","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.02745","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085307089","display_name":"Frederic Grabowski","orcid":"https://orcid.org/0000-0003-4070-9500"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grabowski, Frederic","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135321396","display_name":"Jacek Szczerbi\u0144ski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Szczerbi\u0144ski, Jacek","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135323786","display_name":"Maciej Ja\u015bkowski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ja\u015bkowski, Maciej","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014040333","display_name":"Kalina Jasinska-Kobus","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jasi\u0144ska-Kobus, Kalina","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040654786","display_name":"Pawe\u0142 D\u0105browski-Tuma\u0144ski","orcid":"https://orcid.org/0000-0001-9412-1716"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"D\u0105browski-Tuma\u0144ski, Pawe\u0142","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083420446","display_name":"Tomasz Jetka","orcid":"https://orcid.org/0000-0001-7449-9818"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jetka, Tomasz","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5044886432","display_name":"Bartosz Topolski","orcid":"https://orcid.org/0000-0002-7126-9424"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Topolski, Bartosz","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.8055999875068665,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.8055999875068665,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.15950000286102295,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.0031999999191612005,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.499099999666214},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.4307999908924103},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4284999966621399},{"id":"https://openalex.org/keywords/property","display_name":"Property (philosophy)","score":0.40450000762939453},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.39320001006126404},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.38679999113082886},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3449999988079071},{"id":"https://openalex.org/keywords/binary-classification","display_name":"Binary classification","score":0.34139999747276306}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5907999873161316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5633999705314636},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5626000165939331},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.499099999666214},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.4307999908924103},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4284999966621399},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.40450000762939453},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.39320001006126404},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.38679999113082886},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37610000371932983},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3449999988079071},{"id":"https://openalex.org/C66905080","wikidata":"https://www.wikidata.org/wiki/Q17005494","display_name":"Binary classification","level":3,"score":0.34139999747276306},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3375999927520752},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.32820001244544983},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.02745","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02745","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.02745","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02745","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7180926203727722}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Molecular":[0],"property":[1],"models":[2,24],"increasingly":[3],"support":[4],"high-stakes":[5],"drug-discovery":[6],"decisions,":[7],"but":[8],"their":[9],"outputs":[10],"are":[11,145],"often":[12,160],"difficult":[13],"to":[14,123,230],"audit:":[15],"classical":[16],"predictors":[17],"return":[18],"scores":[19],"without":[20],"rationale,":[21],"while":[22],"language":[23,41],"can":[25,234],"produce":[26],"fluent":[27],"explanations":[28,144],"weakly":[29],"grounded":[30,147],"in":[31,47,91,106,114],"the":[32,151,164,187],"input":[33],"molecule.":[34],"We":[35],"introduce":[36],"Bolek,":[37],"a":[38,52],"compact":[39],"multimodal":[40],"model":[42],"that":[43,222],"grounds":[44],"natural-language":[45],"reasoning":[46,80,227,239],"molecular":[48,65,93,232,238],"structure":[49],"by":[50],"injecting":[51],"Morgan":[53],"fingerprint":[54],"embedding":[55],"into":[56],"an":[57],"instruction-tuned":[58],"text":[59],"decoder.":[60],"Bolek":[61,98,196],"is":[62],"fine-tuned":[63],"on":[64,78,103,110,129,190,199,207],"alignment":[66],"tasks,":[67,97],"including":[68],"molecule":[69],"description,":[70],"RDKit":[71,170],"descriptor":[72],"prediction,":[73],"and":[74,77,109,163,178,201,226],"substructure":[75],"detection,":[76],"downstream":[79,215],"over":[81],"15":[82,113,132,191],"TDC":[83,193],"binary":[84,133],"classification":[85,134,194],"tasks":[86,135],"using":[87],"synthetic":[88],"chains-of-thought":[89],"anchored":[90],"concrete":[92],"features.":[94],"Across":[95],"these":[96],"outperforms":[99,127],"its":[100,141],"Qwen3-4B-Instruct":[101],"base":[102],"all":[104],"endpoints":[105,211],"yes/no":[107],"mode":[108],"13":[111,130],"of":[112,131,150],"chain-of-thought":[115],"mode,":[116],"raising":[117],"mean":[118],"ROC/PR":[119],"AUC":[120],"from":[121],"0.55":[122],"0.76.":[124],"It":[125],"also":[126],"TxGemma-9B-Chat":[128],"despite":[136,212],"being":[137],"less":[138],"than":[139,148],"half":[140],"size.":[142],"Bolek's":[143],"more":[146,159],"those":[149],"baseline":[152],"LLMs:":[153],"it":[154,202],"cites":[155],"numerical":[156],"descriptors":[157,173],"10-100x":[158],"per":[161],"chain-of-thought,":[162],"cited":[165],"values":[166],"agree":[167],"strongly":[168],"with":[169],"for":[171],"key":[172],"such":[174],"as":[175],"TPSA,":[176],"MolLogP,":[177],"MolWt":[179],"(Spearman":[180],"rho":[181],"=":[182],"0.87-0.91).":[183],"Generalisation":[184],"extends":[185],"beyond":[186],"training":[188],"panel:":[189],"unseen":[192],"endpoints,":[195],"matches":[197],"TxGemma":[198],"five,":[200],"produces":[203],"non-trivial":[204],"rank":[205],"correlations":[206],"three":[208],"held-out":[209],"regression":[210,216],"never":[213],"seeing":[214],"during":[217],"training.":[218],"These":[219],"results":[220],"suggest":[221],"targeted":[223],"modality":[224],"injection":[225],"supervision":[228],"tied":[229],"verifiable":[231],"features":[233],"yield":[235],"compact,":[236],"auditable":[237],"models.":[240]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
