{"id":"https://openalex.org/W4301372551","doi":"https://doi.org/10.1021/acs.jcim.2c00715","title":"Large-Scale Distributed Training of Transformers for Chemical Fingerprinting","display_name":"Large-Scale Distributed Training of Transformers for Chemical Fingerprinting","publication_year":2022,"publication_date":"2022-10-04","ids":{"openalex":"https://openalex.org/W4301372551","doi":"https://doi.org/10.1021/acs.jcim.2c00715","pmid":"https://pubmed.ncbi.nlm.nih.gov/36195574"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.2c00715","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.2c00715","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1021/acs.jcim.2c00715","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021880727","display_name":"Hisham Abdel-Aty","orcid":null},"institutions":[{"id":"https://openalex.org/I47508984","display_name":"Imperial College London","ror":"https://ror.org/041kmwe10","country_code":"GB","type":"education","lineage":["https://openalex.org/I47508984"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Hisham Abdel-Aty","raw_affiliation_strings":["Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd\u2019s Bush, LondonW12 0BZ, UK","Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd's Bush, LondonW12 0BZ, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd\u2019s Bush, LondonW12 0BZ, UK","institution_ids":["https://openalex.org/I47508984"]},{"raw_affiliation_string":"Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd's Bush, LondonW12 0BZ, UK","institution_ids":["https://openalex.org/I47508984"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036071015","display_name":"Ian R. Gould","orcid":"https://orcid.org/0000-0003-3559-0234"},"institutions":[{"id":"https://openalex.org/I47508984","display_name":"Imperial College London","ror":"https://ror.org/041kmwe10","country_code":"GB","type":"education","lineage":["https://openalex.org/I47508984"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Ian R. Gould","raw_affiliation_strings":["Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd\u2019s Bush, LondonW12 0BZ, UK","Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd's Bush, LondonW12 0BZ, UK"],"raw_orcid":"https://orcid.org/0000-0003-3559-0234","affiliations":[{"raw_affiliation_string":"Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd\u2019s Bush, LondonW12 0BZ, UK","institution_ids":["https://openalex.org/I47508984"]},{"raw_affiliation_string":"Department of Chemistry and Institute of Chemical Biology, Imperial College London, Molecular Sciences Research Hub, Shepherd's Bush, LondonW12 0BZ, UK","institution_ids":["https://openalex.org/I47508984"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5036071015"],"corresponding_institution_ids":["https://openalex.org/I47508984"],"apc_list":null,"apc_paid":null,"fwci":3.922,"has_fulltext":false,"cited_by_count":31,"citation_normalized_percentile":{"value":0.94389703,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"62","issue":"20","first_page":"4852","last_page":"4862"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10836","display_name":"Metabolomics and Mass Spectrometry Studies","score":0.9648000001907349,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.797882080078125},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6493606567382812},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.567345142364502},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.5173625349998474},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.46417236328125},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.44535213708877563},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.44270944595336914},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43610039353370667},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.0800618827342987}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.797882080078125},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6493606567382812},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.567345142364502},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.5173625349998474},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.46417236328125},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44535213708877563},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.44270944595336914},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43610039353370667},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0800618827342987},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D019985","descriptor_name":"Benchmarking","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":3,"locations":[{"id":"doi:10.1021/acs.jcim.2c00715","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.2c00715","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:36195574","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/36195574","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:9597661","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/9597661","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"J Chem Inf Model","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1021/acs.jcim.2c00715","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.2c00715","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6800000071525574,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G6895843725","display_name":"DTP 2018-19 Imperial College London","funder_award_id":"EP/R513052/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W142212369","https://openalex.org/W1757990252","https://openalex.org/W1975147762","https://openalex.org/W1988037271","https://openalex.org/W2039749139","https://openalex.org/W2096541451","https://openalex.org/W2114704115","https://openalex.org/W2119512897","https://openalex.org/W2153693853","https://openalex.org/W2290847742","https://openalex.org/W2406943157","https://openalex.org/W2461620095","https://openalex.org/W2594183968","https://openalex.org/W2735246657","https://openalex.org/W2769423117","https://openalex.org/W2901476322","https://openalex.org/W2925830236","https://openalex.org/W2947423323","https://openalex.org/W2962784628","https://openalex.org/W2963250244","https://openalex.org/W2970641574","https://openalex.org/W2979826702","https://openalex.org/W2987091515","https://openalex.org/W3097145107","https://openalex.org/W3100157108","https://openalex.org/W3103092523","https://openalex.org/W3188589934"],"related_works":["https://openalex.org/W4386014872","https://openalex.org/W1847536016","https://openalex.org/W4361193986","https://openalex.org/W3149094754","https://openalex.org/W2148703997","https://openalex.org/W4366851046","https://openalex.org/W3172681236","https://openalex.org/W2033371749","https://openalex.org/W3214032513","https://openalex.org/W4380568733"],"abstract_inverted_index":{"Transformer":[0],"models":[1,36,55],"have":[2,20],"become":[3],"a":[4,64,93,111,139],"popular":[5],"choice":[6],"for":[7,25,60,73,115],"various":[8],"machine":[9],"learning":[10],"tasks":[11],"due":[12],"to":[13,45,91,128,151],"their":[14,135],"often":[15],"outstanding":[16],"performance.":[17],"Recently,":[18],"transformers":[19],"been":[21],"used":[22],"in":[23,63,103],"chemistry":[24,74],"classifying":[26],"reactions,":[27],"reaction":[28],"prediction,":[29,32],"physiochemical":[30],"property":[31],"and":[33,42,106,133],"more.":[34],"These":[35],"require":[37],"huge":[38],"amounts":[39],"of":[40,98],"data":[41],"localized":[43],"compute":[44],"train":[46],"effectively.":[47],"In":[48],"this":[49],"work,":[50],"we":[51],"demonstrate":[52],"that":[53],"these":[54],"can":[56],"successfully":[57],"be":[58],"trained":[59],"chemical":[61,104],"problems":[62],"distributed":[65,89],"manner":[66],"across":[67],"many":[68],"computers\u2500a":[69],"more":[70,125,130],"common":[71],"scenario":[72],"institutions.":[75],"We":[76,87,118,137],"introduce":[77],"MFBERT:":[78],"Molecular":[79],"Fingerprints":[80],"through":[81],"Bidirectional":[82],"Encoder":[83],"Representations":[84],"from":[85,147],"Transformers.":[86],"use":[88],"computing":[90],"pre-train":[92],"transformer":[94],"model":[95,122],"on":[96,110,123],"one":[97],"the":[99,144],"largest":[100],"aggregate":[101],"datasets":[102,127],"literature":[105],"achieve":[107],"state-of-the-art":[108],"scores":[109],"virtual":[112],"screening":[113],"benchmark":[114],"molecular":[116,149,152],"fingerprints.":[117],"then":[119],"fine-tune":[120],"our":[121],"smaller,":[124],"specific":[126],"generate":[129],"targeted":[131],"fingerprints":[132,153],"assess":[134],"quality.":[136],"utilize":[138],"SentencePiece":[140],"tokenization":[141,159],"model,":[142],"where":[143],"whole":[145],"procedure":[146],"raw":[148],"representation":[150],"becomes":[154],"data-driven,":[155],"with":[156],"no":[157],"explicit":[158],"rules.":[160]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":8},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
