{"id":"https://openalex.org/W7160890222","doi":"https://doi.org/10.48550/arxiv.2605.10722","title":"On Improving Graph Neural Networks for QSAR by Pre-training on Extended-Connectivity Fingerprints","display_name":"On Improving Graph Neural Networks for QSAR by Pre-training on Extended-Connectivity Fingerprints","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160890222","doi":"https://doi.org/10.48550/arxiv.2605.10722"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.10722","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10722","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.10722","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061559592","display_name":"Sam Money-Kyrle","orcid":"https://orcid.org/0000-0002-7683-4226"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Money-Kyrle, Sam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135931312","display_name":"Markus Dablander","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dablander, Markus","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082247372","display_name":"Thierry Hanser","orcid":"https://orcid.org/0000-0002-0392-6154"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hanser, Thierry","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041238648","display_name":"St\u00e9phane Werner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Werner, Stephane","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130765970","display_name":"Charlotte M. Deane","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deane, Charlotte M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5011245839","display_name":"Garrett M. Morris","orcid":"https://orcid.org/0000-0003-1731-8405"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Morris, Garrett M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9221000075340271,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9221000075340271,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.04479999840259552,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10887","display_name":"Bioinformatics and Genomic Networks","score":0.007400000002235174,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantitative-structure\u2013activity-relationship","display_name":"Quantitative structure\u2013activity relationship","score":0.8170999884605408},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5949000120162964},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.47519999742507935},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4325000047683716},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.4187999963760376},{"id":"https://openalex.org/keywords/applicability-domain","display_name":"Applicability domain","score":0.3303999900817871}],"concepts":[{"id":"https://openalex.org/C164126121","wikidata":"https://www.wikidata.org/wiki/Q766383","display_name":"Quantitative structure\u2013activity relationship","level":2,"score":0.8170999884605408},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6589000225067139},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5949000120162964},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5701000094413757},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5217000246047974},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.47519999742507935},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4325000047683716},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.4187999963760376},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4147000014781952},{"id":"https://openalex.org/C107908354","wikidata":"https://www.wikidata.org/wiki/Q4781456","display_name":"Applicability domain","level":3,"score":0.3303999900817871},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.3100000023841858},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.26170000433921814}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.10722","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10722","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.10722","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10722","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5504409074783325,"display_name":"Good health and well-being","id":"https://metadata.un.org/sdg/3"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Molecular":[0],"Graph":[1],"Neural":[2],"Networks":[3],"(GNNs)":[4],"are":[5],"increasingly":[6],"common":[7],"in":[8,71,101],"drug":[9],"discovery,":[10],"particularly":[11],"for":[12,34,37,85],"Quantitative":[13],"Structure-Activity":[14],"Relationship":[15],"(QSAR)":[16],"studies;":[17],"yet,":[18],"their":[19],"superiority":[20],"compared":[21],"to":[22,41],"classical":[23],"molecular":[24],"featurisation":[25],"approaches":[26],"is":[27],"disputed.":[28],"We":[29,46],"report":[30],"a":[31,67,141],"general":[32],"strategy":[33],"improving":[35],"GNNs":[36,99],"QSAR":[38,147],"by":[39],"pre-training":[40,114,123,134],"predict":[42],"Extended-Connectivity":[43],"Fingerprints":[44],"(ECFP).":[45],"validate":[47],"our":[48,129],"approach":[49],"with":[50],"statistical":[51],"tests":[52],"and":[53,89],"challenging":[54],"out-of-distribution":[55],"(OOD)":[56],"splits.":[57],"Across":[58],"five":[59],"out":[60],"of":[61,109,144],"six":[62],"Biogen":[63],"benchmarks,":[64],"we":[65,105,119],"observed":[66],"statistically":[68],"significant":[69],"improvement":[70],"standard":[72],"performance":[73,139],"metrics":[74],"over":[75],"all":[76],"evaluated":[77],"baselines":[78],"when":[79],"using":[80],"ECFP":[81],"pre-trained":[82,98],"GNNs.":[83],"However,":[84],"more":[86,90],"heterogeneous":[87],"datasets":[88],"complex":[91],"endpoints,":[92],"such":[93],"as":[94],"binding":[95],"affinity":[96],"prediction,":[97],"underperformed":[100],"OOD":[102,138],"settings.":[103],"Importantly,":[104],"investigated":[106],"the":[107],"impact":[108],"substructure-level":[110],"data":[111],"leakage":[112],"during":[113],"on":[115,124,140],"downstream":[116,137],"performance.":[117],"While":[118],"identified":[120],"scenarios":[121],"where":[122],"ECFPs":[125],"was":[126],"less":[127],"effective,":[128],"findings":[130],"show":[131],"that":[132],"ECFP-based":[133],"can":[135],"enhance":[136],"diverse":[142],"set":[143],"practically":[145],"relevant":[146],"tasks.":[148]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
