{"id":"https://openalex.org/W7129729748","doi":"https://doi.org/10.48550/arxiv.2602.13958","title":"Chemical Language Models for Natural Products: A State-Space Model Approach","display_name":"Chemical Language Models for Natural Products: A State-Space Model Approach","publication_year":2026,"publication_date":"2026-02-15","ids":{"openalex":"https://openalex.org/W7129729748","doi":"https://doi.org/10.48550/arxiv.2602.13958"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.13958","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126257677","display_name":"Ho-Hsuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Ho-Hsuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006373195","display_name":"Afnan Sultan","orcid":"https://orcid.org/0009-0000-6126-7153"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sultan, Afnan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123946175","display_name":"Andrea Volkamer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Volkamer, Andrea","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126181080","display_name":"Dietrich Klakow","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Klakow, Dietrich","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5126257677"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.48579999804496765,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.48579999804496765,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.43320000171661377,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.0215000007301569,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/property","display_name":"Property (philosophy)","score":0.6187999844551086},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.588100016117096},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.47999998927116394},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46459999680519104},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.43529999256134033},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.4187999963760376},{"id":"https://openalex.org/keywords/computational-model","display_name":"Computational model","score":0.4097000062465668},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.34860000014305115}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6912999749183655},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.6187999844551086},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.588100016117096},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5631999969482422},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5302000045776367},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.47999998927116394},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46459999680519104},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.43529999256134033},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.4187999963760376},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.4097000062465668},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3490000069141388},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.34860000014305115},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.310699999332428},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C74187038","wikidata":"https://www.wikidata.org/wiki/Q1418791","display_name":"Drug discovery","level":2,"score":0.30250000953674316},{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C2994228675","wikidata":"https://www.wikidata.org/wiki/Q512599","display_name":"Property value","level":3,"score":0.2980000078678131},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27730000019073486},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.27570000290870667},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.26179999113082886},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2615000009536743}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.13958","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.13958","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.13958","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.13958","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Language":[0],"models":[1,35,40,67,162],"are":[2],"widely":[3],"used":[4],"in":[5,23],"chemistry":[6],"for":[7,70],"molecular":[8],"property":[9,96,133],"prediction":[10,97],"and":[11,42,44,68,85,95,105,113,118],"small-molecule":[12],"generation,":[13],"yet":[14],"Natural":[15],"Products":[16],"(NPs)":[17],"remain":[18],"underexplored":[19],"despite":[20],"their":[21],"importance":[22],"drug":[24],"discovery.":[25],"To":[26],"address":[27],"this":[28],"gap,":[29],"we":[30,58],"develop":[31],"NP-specific":[32,86],"chemical":[33],"language":[34],"(NPCLMs)":[36],"by":[37,139],"pre-training":[38,155],"state-space":[39,66],"(Mamba":[41],"Mamba-2)":[43],"comparing":[45],"them":[46],"with":[47,74,120],"transformer":[48],"baselines":[49],"(GPT).":[50],"Using":[51],"a":[52],"dataset":[53],"of":[54,64],"about":[55,157],"1M":[56,158],"NPs,":[57],"present":[59],"the":[60],"first":[61],"systematic":[62],"comparison":[63],"selective":[65],"transformers":[69],"NP-focused":[71],"tasks,":[72],"together":[73],"eight":[75],"tokenization":[76],"strategies":[77],"including":[78],"character-level,":[79],"Atom-in-SMILES":[80],"(AIS),":[81],"byte-pair":[82],"encoding":[83],"(BPE),":[84],"BPE.":[87],"We":[88],"evaluate":[89],"molecule":[90],"generation":[91],"(validity,":[92],"uniqueness,":[93],"novelty)":[94],"(membrane":[98],"permeability,":[99],"taste,":[100],"anti-cancer":[101],"activity)":[102],"using":[103],"MCC":[104,141],"AUC-ROC.":[106],"Mamba":[107,135],"generates":[108],"1-2":[109],"percent":[110],"more":[111,129],"valid":[112],"unique":[114],"molecules":[115],"than":[116],"Mamba-2":[117],"GPT,":[119],"fewer":[121],"long-range":[122],"dependency":[123],"errors,":[124],"while":[125,145],"GPT":[126,138],"yields":[127],"slightly":[128],"novel":[130],"structures.":[131],"For":[132],"prediction,":[134],"variants":[136],"outperform":[137],"0.02-0.04":[140],"under":[142],"random":[143],"splits,":[144],"scaffold":[146],"splits":[147],"show":[148],"comparable":[149],"performance.":[150],"Results":[151],"demonstrate":[152],"that":[153],"domain-specific":[154],"on":[156,164],"NPs":[159],"can":[160],"match":[161],"trained":[163],"datasets":[165],"over":[166],"100":[167],"times":[168],"larger.":[169]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-18T00:00:00"}
