{"id":"https://openalex.org/W7160953306","doi":"https://doi.org/10.48550/arxiv.2605.09949","title":"From Syntax to Semantics: Unveiling the Emergence of Chirality in SMILES Translation Models","display_name":"From Syntax to Semantics: Unveiling the Emergence of Chirality in SMILES Translation Models","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160953306","doi":"https://doi.org/10.48550/arxiv.2605.09949"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.09949","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09949","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.09949","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135934396","display_name":"Zehao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zehao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004040969","display_name":"Yasuhiro Yoshikai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yoshikai, Yasuhiro","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079497729","display_name":"Shumpei Nemoto","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nemoto, Shumpei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061014418","display_name":"Hiroyuki Kusuhara","orcid":"https://orcid.org/0000-0002-3641-8746"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kusuhara, Hiroyuki","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5002472002","display_name":"Tadahaya Mizuno","orcid":"https://orcid.org/0000-0002-1638-602X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mizuno, Tadahaya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.5044999718666077,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.5044999718666077,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.13089999556541443,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11445","display_name":"Origins and Evolution of Life","score":0.08919999748468399,"subfield":{"id":"https://openalex.org/subfields/3103","display_name":"Astronomy and Astrophysics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5569000244140625},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.41940000653266907},{"id":"https://openalex.org/keywords/chirality","display_name":"Chirality (physics)","score":0.40639999508857727},{"id":"https://openalex.org/keywords/string","display_name":"String (physics)","score":0.3555000126361847},{"id":"https://openalex.org/keywords/meaning","display_name":"Meaning (existential)","score":0.34779998660087585},{"id":"https://openalex.org/keywords/substitution","display_name":"Substitution (logic)","score":0.33889999985694885},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.3352000117301941}],"concepts":[{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5569000244140625},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5286999940872192},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48579999804496765},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4650999903678894},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.41940000653266907},{"id":"https://openalex.org/C124668440","wikidata":"https://www.wikidata.org/wiki/Q2305173","display_name":"Chirality (physics)","level":5,"score":0.40639999508857727},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.3555000126361847},{"id":"https://openalex.org/C2780876879","wikidata":"https://www.wikidata.org/wiki/Q3054749","display_name":"Meaning (existential)","level":2,"score":0.34779998660087585},{"id":"https://openalex.org/C2778220771","wikidata":"https://www.wikidata.org/wiki/Q1522579","display_name":"Substitution (logic)","level":2,"score":0.33889999985694885},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.3352000117301941},{"id":"https://openalex.org/C191795146","wikidata":"https://www.wikidata.org/wiki/Q3878446","display_name":"Norm (philosophy)","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3107999861240387},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.296999990940094},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C486523","wikidata":"https://www.wikidata.org/wiki/Q494483","display_name":"Enantiomer","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.09949","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09949","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.09949","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09949","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8302263021469116,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Understanding":[0],"how":[1,81],"chemical":[2,7,24,234],"language":[3],"models":[4,70],"(CLMs)":[5],"learn":[6],"meaning":[8],"from":[9],"molecular":[10,170],"string":[11,17],"representations,":[12],"rather":[13],"than":[14],"only":[15],"surface-level":[16],"patterns,":[18],"is":[19,84,113],"an":[20,138],"important":[21],"question":[22],"in":[23,42,98,141,157,172,204,228],"representation":[25,235],"learning":[26,29,111],"and":[27,45,74,120,134,148,155,160,186],"machine":[28],"for":[30,71,222,232],"chemistry.":[31],"Chirality":[32],"provides":[33],"a":[34,64,95,105,152,165,191,218],"demanding":[35],"test":[36],"case:":[37],"enantiomers":[38],"can":[39,215],"differ":[40],"greatly":[41],"pharmacological":[43],"activity":[44],"toxicity,":[46],"yet":[47],"CLMs":[48],"often":[49],"struggle":[50],"to":[51,79],"distinguish":[52],"chiral":[53,82,110,126,169],"configurations":[54],"reliably.":[55],"Here":[56],"we":[57,93],"present":[58],"Pan-CORE":[59,91],"(Pan-Chemical":[60],"Omniscale":[61],"Representation":[62],"Engine),":[63],"family":[65],"of":[66,125,129,168,183,194,225],"autoregressive":[67],"Transformer-based":[68],"encoder-decoder":[69],"SMILES":[72,213],"translation,":[73],"use":[75],"high-temporal-resolution":[76],"checkpoint":[77],"analysis":[78,224],"investigate":[80],"information":[83],"learned":[85],"during":[86],"training.":[87],"Across":[88],"all":[89],"tested":[90],"variants,":[92],"observe":[94],"reproducible":[96],"jump-up":[97],"which":[99,142],"chiral-token":[100,143,201],"accuracy":[101,202],"rises":[102],"abruptly":[103],"after":[104],"long":[106],"plateau,":[107],"suggesting":[108],"that":[109,212],"stagnation":[112],"not":[114],"explained":[115],"by":[116],"model":[117],"capacity":[118],"alone":[119],"instead":[121],"reflects":[122],"the":[123,173,180,184,205],"complexity":[124],"constraints.":[127],"Analyses":[128],"attention":[130],"dynamics,":[131],"residual-stream":[132],"trajectories,":[133],"latent-space":[135],"geometry":[136],"support":[137],"encoder-centered":[139,181],"mechanism":[140],"representations":[144,171],"undergo":[145],"transient":[146],"destabilization":[147],"reconstruction,":[149],"seen":[150],"as":[151,217],"V-shaped":[153],"drop":[154],"recovery":[156],"vector":[158],"norm":[159],"directional":[161],"stability,":[162],"together":[163],"with":[164,230],"clear":[166],"reorganization":[167],"latent":[174],"space.":[175],"Encoder-decoder":[176],"cross-evaluation":[177],"further":[178],"supports":[179],"nature":[182],"transition,":[185],"targeted":[187],"attention-head":[188],"ablation":[189],"identifies":[190],"small":[192],"set":[193],"chiral-sensitive":[195],"heads":[196],"whose":[197],"removal":[198],"selectively":[199],"reduces":[200],"even":[203],"fully":[206],"trained":[207],"model.":[208],"These":[209],"findings":[210],"show":[211],"translation":[214],"serve":[216],"useful":[219],"experimental":[220],"system":[221],"mechanistic":[223],"semantic":[226],"emergence":[227],"CLMs,":[229],"implications":[231],"interpretable":[233],"learning.":[236]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
