{"id":"https://openalex.org/W4384661704","doi":"https://doi.org/10.1088/2632-2153/ace878","title":"Standardizing chemical compounds with language models","display_name":"Standardizing chemical compounds with language models","publication_year":2023,"publication_date":"2023-07-18","ids":{"openalex":"https://openalex.org/W4384661704","doi":"https://doi.org/10.1088/2632-2153/ace878"},"language":"en","primary_location":{"id":"doi:10.1088/2632-2153/ace878","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ace878","pdf_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ace878/pdf","source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ace878/pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040420600","display_name":"Miruna T. Cretu","orcid":"https://orcid.org/0000-0002-2192-5091"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Miruna T Cretu","raw_affiliation_strings":["IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND"],"raw_orcid":"https://orcid.org/0000-0002-2192-5091","affiliations":[{"raw_affiliation_string":"IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068553985","display_name":"Alessandra Toniato","orcid":"https://orcid.org/0000-0002-5218-8653"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Alessandra Toniato","raw_affiliation_strings":["IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND"],"raw_orcid":"https://orcid.org/0000-0002-5218-8653","affiliations":[{"raw_affiliation_string":"IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021552278","display_name":"Amol Thakkar","orcid":"https://orcid.org/0000-0003-0403-4067"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Amol Thakkar","raw_affiliation_strings":["IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND"],"raw_orcid":"https://orcid.org/0000-0003-0403-4067","affiliations":[{"raw_affiliation_string":"IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065418363","display_name":"Amin Debabeche","orcid":null},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Amin A Debabeche","raw_affiliation_strings":["IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080069398","display_name":"Teodoro Laino","orcid":"https://orcid.org/0000-0001-8717-0456"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Teodoro Laino","raw_affiliation_strings":["IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND"],"raw_orcid":"https://orcid.org/0000-0001-8717-0456","affiliations":[{"raw_affiliation_string":"IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND","institution_ids":["https://openalex.org/I4210126328"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088744199","display_name":"Alain C. Vaucher","orcid":"https://orcid.org/0000-0001-7554-0288"},"institutions":[{"id":"https://openalex.org/I4210126328","display_name":"IBM Research - Zurich","ror":"https://ror.org/02js37d36","country_code":"CH","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210126328"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Alain C Vaucher","raw_affiliation_strings":["IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND"],"raw_orcid":"https://orcid.org/0000-0001-7554-0288","affiliations":[{"raw_affiliation_string":"IBM Research Europe, S\u00e4umerstrasse 4, R\u00fcschlikon, 8803, SWITZERLAND","institution_ids":["https://openalex.org/I4210126328"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5040420600"],"corresponding_institution_ids":["https://openalex.org/I4210126328"],"apc_list":{"value":1600,"currency":"GBP","value_usd":1962},"apc_paid":{"value":1600,"currency":"GBP","value_usd":1962},"fwci":0.2772,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.45667541,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"4","issue":"3","first_page":"035014","last_page":"035014"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10911","display_name":"Chemical Synthesis and Analysis","score":0.9847000241279602,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/standardization","display_name":"Standardization","score":0.9569816589355469},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7160084247589111},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5212541222572327},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.47965434193611145},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.40834498405456543},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33712831139564514}],"concepts":[{"id":"https://openalex.org/C188087704","wikidata":"https://www.wikidata.org/wiki/Q369577","display_name":"Standardization","level":2,"score":0.9569816589355469},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7160084247589111},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5212541222572327},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.47965434193611145},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.40834498405456543},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33712831139564514},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1088/2632-2153/ace878","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ace878","pdf_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ace878/pdf","source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:940a57d4fb2c4fa6b80bc83dd1673912","is_oa":true,"landing_page_url":"https://doaj.org/article/940a57d4fb2c4fa6b80bc83dd1673912","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Machine Learning: Science and Technology, Vol 4, Iss 3, p 035014 (2023)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1088/2632-2153/ace878","is_oa":true,"landing_page_url":"https://doi.org/10.1088/2632-2153/ace878","pdf_url":"https://iopscience.iop.org/article/10.1088/2632-2153/ace878/pdf","source":{"id":"https://openalex.org/S4210200687","display_name":"Machine Learning Science and Technology","issn_l":"2632-2153","issn":["2632-2153"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320083","host_organization_name":"IOP Publishing","host_organization_lineage":["https://openalex.org/P4310320083","https://openalex.org/P4310311669"],"host_organization_lineage_names":["IOP Publishing","Institute of Physics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning: Science and Technology","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5299999713897705}],"awards":[{"id":"https://openalex.org/G942768626","display_name":null,"funder_award_id":"NCCR-Catalysis 180544","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"}],"funders":[{"id":"https://openalex.org/F4320320924","display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung","ror":"https://ror.org/00yjd3n13"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4384661704.pdf"},"referenced_works_count":36,"referenced_works":["https://openalex.org/W1975147762","https://openalex.org/W2008732224","https://openalex.org/W2016455326","https://openalex.org/W2038702914","https://openalex.org/W2046336719","https://openalex.org/W2051810242","https://openalex.org/W2057069496","https://openalex.org/W2102110064","https://openalex.org/W2114940395","https://openalex.org/W2151697120","https://openalex.org/W2189911347","https://openalex.org/W2209568211","https://openalex.org/W2578240541","https://openalex.org/W2580919858","https://openalex.org/W2747592475","https://openalex.org/W2887137963","https://openalex.org/W2887459817","https://openalex.org/W2903262661","https://openalex.org/W2947423323","https://openalex.org/W2984110950","https://openalex.org/W3002554271","https://openalex.org/W3010145447","https://openalex.org/W3038823641","https://openalex.org/W3082081167","https://openalex.org/W3087318293","https://openalex.org/W3093485337","https://openalex.org/W3103092523","https://openalex.org/W3118349318","https://openalex.org/W3133536333","https://openalex.org/W3137928831","https://openalex.org/W3146384714","https://openalex.org/W3194082573","https://openalex.org/W4226345167","https://openalex.org/W4307468223","https://openalex.org/W4385245566","https://openalex.org/W6874847716"],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W3046775127","https://openalex.org/W3107602296","https://openalex.org/W3170094116","https://openalex.org/W4386462264","https://openalex.org/W4313488044","https://openalex.org/W3209574120","https://openalex.org/W4312192474","https://openalex.org/W4210805261"],"abstract_inverted_index":{"Abstract":[0],"With":[1],"the":[2,24,65,145,172,186,193],"growing":[3],"amount":[4],"of":[5,26,59,139,154,170,195],"chemical":[6,16],"data":[7],"stored":[8],"digitally,":[9],"it":[10],"has":[11],"become":[12],"crucial":[13],"to":[14,91,121,167,179,188,192],"represent":[15],"compounds":[17,124],"accurately":[18],"and":[19,31,191,203],"consistently.":[20],"Harmonized":[21],"representations":[22,41,169],"facilitate":[23],"extraction":[25],"insightful":[27],"information":[28],"from":[29,165],"datasets,":[30,43],"are":[32,89],"advantageous":[33],"for":[34,69],"machine":[35],"learning":[36,108,119],"applications.":[37],"To":[38],"achieve":[39],"consistent":[40],"throughout":[42],"one":[44],"relies":[45],"on":[46,79,135,201],"molecule":[47],"standardization,":[48],"which":[49,126],"is":[50,128],"typically":[51],"accomplished":[52],"using":[53],"rule-based":[54,111],"algorithms":[55],"that":[56,88],"modify":[57],"descriptions":[58],"functional":[60],"groups.":[61],"Here,":[62],"we":[63,184],"present":[64],"first":[66],"deep-learning":[67],"model":[68,96,143],"molecular":[70,148],"standardization.":[71],"We":[72,114],"enable":[73],"custom":[74],"standardization":[75,86,112,132,157],"schemes":[76],"based":[77,134,200],"solely":[78],"data,":[80],"which,":[81],"as":[82],"additional":[83],"benefit,":[84],"support":[85],"options":[87],"difficult":[90],"encode":[92],"into":[93],"rules.":[94],"Our":[95],"achieves":[97],"over":[98],"<mml:math":[99],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[100],"overflow=\"scroll\">":[101],"<mml:mn>98</mml:mn>":[102],"<mml:mi":[103],"mathvariant=\"normal\">%</mml:mi>":[104],"</mml:math>":[105],"accuracy":[106,153],"in":[107,198],"two":[109],"popular":[110],"protocols.":[113],"then":[115],"follow":[116],"a":[117,136,151,163],"transfer":[118],"approach":[120],"standardize":[122],"metal-organic":[123],"(for":[125],"there":[127],"currently":[129],"no":[130],"automated":[131],"practice),":[133],"human-curated":[137],"dataset":[138],"1512":[140],"compounds.":[141],"This":[142],"predicts":[144],"expected":[146],"standardized":[147],"format":[149],"with":[150],"test":[152],"80.7%.":[155],"As":[156],"can":[158,176],"be":[159,177],"considered,":[160],"more":[161],"broadly,":[162],"transformation":[164],"undesired":[166],"desired":[168],"compounds,":[171],"same":[173],"data-driven":[174],"architecture":[175],"applied":[178],"other":[180],"tasks.":[181],"For":[182],"instance,":[183],"demonstrate":[185],"application":[187],"compound":[189],"canonicalization":[190],"determination":[194],"major":[196],"tautomers":[197],"solution,":[199],"computed":[202],"experimental":[204],"data.":[205]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2025-10-10T00:00:00"}
