{"id":"https://openalex.org/W4411616203","doi":"https://doi.org/10.1021/acs.jcim.5c00665","title":"Structural Bias in Three-Dimensional Autoregressive Generative Machine Learning of Organic Molecules","display_name":"Structural Bias in Three-Dimensional Autoregressive Generative Machine Learning of Organic Molecules","publication_year":2025,"publication_date":"2025-06-25","ids":{"openalex":"https://openalex.org/W4411616203","doi":"https://doi.org/10.1021/acs.jcim.5c00665","pmid":"https://pubmed.ncbi.nlm.nih.gov/40556385"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.5c00665","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c00665","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1021/acs.jcim.5c00665","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034071688","display_name":"Zsuzsanna Koczor-Benda","orcid":"https://orcid.org/0000-0002-6714-0337"},"institutions":[{"id":"https://openalex.org/I39555362","display_name":"University of Warwick","ror":"https://ror.org/01a77tt86","country_code":"GB","type":"education","lineage":["https://openalex.org/I39555362"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Zsuzsanna Koczor-Benda","raw_affiliation_strings":["Department of Chemistry","University of Warwick"],"affiliations":[{"raw_affiliation_string":"Department of Chemistry","institution_ids":[]},{"raw_affiliation_string":"University of Warwick","institution_ids":["https://openalex.org/I39555362"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078107823","display_name":"Joe Gilkes","orcid":"https://orcid.org/0000-0002-1864-4571"},"institutions":[{"id":"https://openalex.org/I39555362","display_name":"University of Warwick","ror":"https://ror.org/01a77tt86","country_code":"GB","type":"education","lineage":["https://openalex.org/I39555362"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Joe Gilkes","raw_affiliation_strings":["Centre for Doctoral Training in Modelling of Heterogeneous Systems","Department of Chemistry","University of Warwick"],"affiliations":[{"raw_affiliation_string":"Centre for Doctoral Training in Modelling of Heterogeneous Systems","institution_ids":[]},{"raw_affiliation_string":"Department of Chemistry","institution_ids":[]},{"raw_affiliation_string":"University of Warwick","institution_ids":["https://openalex.org/I39555362"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118619586","display_name":"Francesco Bartucca","orcid":null},"institutions":[{"id":"https://openalex.org/I39555362","display_name":"University of Warwick","ror":"https://ror.org/01a77tt86","country_code":"GB","type":"education","lineage":["https://openalex.org/I39555362"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Francesco Bartucca","raw_affiliation_strings":["Department of Chemistry","University of Warwick"],"affiliations":[{"raw_affiliation_string":"Department of Chemistry","institution_ids":[]},{"raw_affiliation_string":"University of Warwick","institution_ids":["https://openalex.org/I39555362"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118619587","display_name":"Abdulla Al-Fekaiki","orcid":null},"institutions":[{"id":"https://openalex.org/I39555362","display_name":"University of Warwick","ror":"https://ror.org/01a77tt86","country_code":"GB","type":"education","lineage":["https://openalex.org/I39555362"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Abdulla Al-Fekaiki","raw_affiliation_strings":["Department of Chemistry","University of Warwick"],"affiliations":[{"raw_affiliation_string":"Department of Chemistry","institution_ids":[]},{"raw_affiliation_string":"University of Warwick","institution_ids":["https://openalex.org/I39555362"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016855366","display_name":"Reinhard J. Maurer","orcid":"https://orcid.org/0000-0002-3004-785X"},"institutions":[{"id":"https://openalex.org/I39555362","display_name":"University of Warwick","ror":"https://ror.org/01a77tt86","country_code":"GB","type":"education","lineage":["https://openalex.org/I39555362"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Reinhard J. Maurer","raw_affiliation_strings":["Department of Chemistry","Department of Physics","University of Warwick"],"affiliations":[{"raw_affiliation_string":"Department of Chemistry","institution_ids":[]},{"raw_affiliation_string":"Department of Physics","institution_ids":[]},{"raw_affiliation_string":"University of Warwick","institution_ids":["https://openalex.org/I39555362"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5016855366"],"corresponding_institution_ids":["https://openalex.org/I39555362"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10300493,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"65","issue":"13","first_page":"6644","last_page":"6654"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10044","display_name":"Protein Structure and Dynamics","score":0.9232000112533569,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.7815759181976318},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6566447019577026},{"id":"https://openalex.org/keywords/organic-molecules","display_name":"Organic molecules","score":0.5620236992835999},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5354889035224915},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.47133827209472656},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4602159261703491},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.43895241618156433},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3675462603569031},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.36163246631622314},{"id":"https://openalex.org/keywords/biological-system","display_name":"Biological system","score":0.3478734493255615},{"id":"https://openalex.org/keywords/molecule","display_name":"Molecule","score":0.3408603072166443},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.3275200128555298},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.24826249480247498},{"id":"https://openalex.org/keywords/organic-chemistry","display_name":"Organic chemistry","score":0.14350038766860962},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.09122243523597717}],"concepts":[{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.7815759181976318},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6566447019577026},{"id":"https://openalex.org/C2984497647","wikidata":"https://www.wikidata.org/wiki/Q174211","display_name":"Organic molecules","level":3,"score":0.5620236992835999},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5354889035224915},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.47133827209472656},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4602159261703491},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.43895241618156433},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3675462603569031},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.36163246631622314},{"id":"https://openalex.org/C186060115","wikidata":"https://www.wikidata.org/wiki/Q30336093","display_name":"Biological system","level":1,"score":0.3478734493255615},{"id":"https://openalex.org/C32909587","wikidata":"https://www.wikidata.org/wiki/Q11369","display_name":"Molecule","level":2,"score":0.3408603072166443},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.3275200128555298},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.24826249480247498},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.14350038766860962},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.09122243523597717}],"mesh":[{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D009930","descriptor_name":"Organic Chemicals","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":true},{"descriptor_ui":"D009930","descriptor_name":"Organic Chemicals","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":true},{"descriptor_ui":"D009930","descriptor_name":"Organic Chemicals","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":true},{"descriptor_ui":"D015394","descriptor_name":"Molecular Structure","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D015394","descriptor_name":"Molecular Structure","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D015394","descriptor_name":"Molecular Structure","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false}],"locations_count":3,"locations":[{"id":"doi:10.1021/acs.jcim.5c00665","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c00665","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:40556385","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40556385","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:12264931","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/12264931","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"J Chem Inf Model","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1021/acs.jcim.5c00665","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.5c00665","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1194063999","display_name":null,"funder_award_id":"MR/X023109/1","funder_id":"https://openalex.org/F4320314731","funder_display_name":"UK Research and Innovation"},{"id":"https://openalex.org/G2387566524","display_name":"Kelvin-2","funder_award_id":"EP/T022175/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G3985384870","display_name":"EPSRC Centre for Doctoral Training in Modelling of Heterogeneous Systems","funder_award_id":"EP/S022848/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G4575921620","display_name":"Proposal for a Tier 2 Centre - HPC Midlands Plus","funder_award_id":"EP/P020232/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G6951721839","display_name":"Deep learning enabled simulation of plasmonic photocatalysis","funder_award_id":"EP/X014088/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320314731","display_name":"UK Research and Innovation","ror":"https://ror.org/001aqnf71"},{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1975147762","https://openalex.org/W2029413789","https://openalex.org/W2080635178","https://openalex.org/W2114704115","https://openalex.org/W2141245797","https://openalex.org/W2169678694","https://openalex.org/W2529996553","https://openalex.org/W2736137960","https://openalex.org/W2740924709","https://openalex.org/W2778051509","https://openalex.org/W2805002767","https://openalex.org/W2945551948","https://openalex.org/W2976720228","https://openalex.org/W3007517495","https://openalex.org/W3104644561","https://openalex.org/W3126679164","https://openalex.org/W3137185865","https://openalex.org/W3174875816","https://openalex.org/W3201249640","https://openalex.org/W3205590463","https://openalex.org/W3213217061","https://openalex.org/W3215172065","https://openalex.org/W3216666478","https://openalex.org/W4220798957","https://openalex.org/W4284898351","https://openalex.org/W4308696465","https://openalex.org/W4319310661","https://openalex.org/W4320005814","https://openalex.org/W4365442601","https://openalex.org/W4389305212","https://openalex.org/W4407550205","https://openalex.org/W6910477534"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4391584540","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W4395044357","https://openalex.org/W4287117424","https://openalex.org/W4387506531","https://openalex.org/W2087346071","https://openalex.org/W2967848559"],"abstract_inverted_index":{"A":[0],"range":[1],"of":[2,10,40,67,97,113,118,142,151,172,259],"generative":[3,41,100],"machine":[4,101],"learning":[5,102],"models":[6,42,227,235],"for":[7,30,272],"the":[8,65,75,84,89,95,98,107,125,135,152,158,170,175,199,221,230,234,248,257,264],"design":[9],"novel":[11],"molecules":[12,181,194],"and":[13,54,70,110,129,138,144,188,209,236,240],"materials":[14],"have":[15],"been":[16],"proposed":[17],"in":[18,74,198,233],"recent":[19],"years.":[20],"Models":[21],"that":[22,79,157,165,182],"can":[23,216,228],"generate":[24],"three-dimensional":[25],"structures":[26],"are":[27,195],"particularly":[28],"suitable":[29],"quantum":[31],"chemistry":[32],"workflows,":[33],"enabling":[34],"direct":[35],"property":[36,111],"prediction.":[37],"The":[38,251],"performance":[39],"is":[43,60,166,268],"typically":[44],"assessed":[45],"based":[46,210],"on":[47,184,211],"their":[48,61],"ability":[49,62,96],"to":[50,63,81,105,161,218],"produce":[51],"novel,":[52],"valid,":[53],"unique":[55],"molecules.":[56,122,146],"However,":[57],"equally":[58],"important":[59],"learn":[64],"prevalence":[66],"functional":[68,120,136,206,273],"groups":[69],"certain":[71],"chemical":[72,85,108,139,153,245,252],"moieties":[73],"underlying":[76],"training":[77,90,114,143,176,239],"data,":[78,242],"is,":[80],"faithfully":[82],"reproduce":[83,106],"space":[86,109,140],"spanned":[87],"by":[88,169],"data.":[91],"Here,":[92],"we":[93,155,254],"investigate":[94,203],"autoregressive":[99],"model":[103,159,222],"G-SchNet":[104],"distributions":[112,258],"data":[115,177,213],"sets":[116],"composed":[117],"large,":[119],"organic":[121],"We":[123,201],"assess":[124],"elemental":[126],"composition,":[127],"size-":[128],"bond-length":[130],"distributions,":[131],"as":[132,134,263],"well":[133],"group":[137,207],"distribution":[141],"generated":[145,241],"By":[147],"principal":[148],"component":[149],"analysis":[150],"space,":[154],"find":[156,255],"leads":[160],"a":[162,269],"biased":[163],"generation":[164,204,223,231],"largely":[167],"unaffected":[168],"choice":[171],"hyperparameters":[173],"or":[174],"set":[178],"distribution,":[179],"producing":[180],"are,":[183],"average,":[185],"less":[186],"saturated":[187],"contain":[189],"more":[190],"heteroatoms.":[191],"Purely":[192],"aliphatic":[193],"mostly":[196],"absent":[197],"generation.":[200],"further":[202],"with":[205],"constraints":[208],"composite":[212],"sets,":[214],"which":[215,267],"help":[217],"partially":[219],"remedy":[220],"bias.":[224],"Decision":[225],"tree":[226],"recognize":[229],"bias":[232],"discriminate":[237],"between":[238,247],"revealing":[243],"key":[244],"differences":[246,253],"two":[249],"sets.":[250],"affect":[256],"electronic":[260],"properties":[261],"such":[262],"HOMO-LUMO":[265],"gap,":[266],"common":[270],"target":[271],"molecule":[274],"design.":[275]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
