{"id":"https://openalex.org/W4391833093","doi":"https://doi.org/10.48550/arxiv.2402.07970","title":"Utilizing Low-Dimensional Molecular Embeddings for Rapid Chemical Similarity Search","display_name":"Utilizing Low-Dimensional Molecular Embeddings for Rapid Chemical Similarity Search","publication_year":2024,"publication_date":"2024-02-12","ids":{"openalex":"https://openalex.org/W4391833093","doi":"https://doi.org/10.48550/arxiv.2402.07970"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2402.07970","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.07970","pdf_url":"https://arxiv.org/pdf/2402.07970","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2402.07970","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084706907","display_name":"Kathryn E. Kirchoff","orcid":"https://orcid.org/0000-0001-9191-4032"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kirchoff, Kathryn E.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060495006","display_name":"James Wellnitz","orcid":"https://orcid.org/0000-0002-9181-3431"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wellnitz, James","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066018723","display_name":"Joshua E. Hochuli","orcid":"https://orcid.org/0000-0003-4487-3228"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hochuli, Joshua E.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023710351","display_name":"Travis Maxfield","orcid":"https://orcid.org/0000-0002-7105-0246"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maxfield, Travis","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103088096","display_name":"Konstantin I. Popov","orcid":"https://orcid.org/0000-0002-9394-972X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Popov, Konstantin I.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029081281","display_name":"Shawn M. Gomez","orcid":"https://orcid.org/0000-0002-8251-4552"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gomez, Shawn","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078536199","display_name":"Alexander Tropsha","orcid":"https://orcid.org/0000-0003-3802-8896"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tropsha, Alexander","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5084706907"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9836999773979187,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.9836999773979187,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12327","display_name":"Various Chemistry Research Topics","score":0.9819999933242798,"subfield":{"id":"https://openalex.org/subfields/1606","display_name":"Physical and Theoretical Chemistry"},"field":{"id":"https://openalex.org/fields/16","display_name":"Chemistry"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10908","display_name":"Analytical Chemistry and Chromatography","score":0.9495000243186951,"subfield":{"id":"https://openalex.org/subfields/1607","display_name":"Spectroscopy"},"field":{"id":"https://openalex.org/fields/16","display_name":"Chemistry"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.7377269268035889},{"id":"https://openalex.org/keywords/chemical-similarity","display_name":"Chemical similarity","score":0.4391757845878601},{"id":"https://openalex.org/keywords/nearest-neighbor-search","display_name":"Nearest neighbor search","score":0.42700162529945374},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.40713173151016235},{"id":"https://openalex.org/keywords/statistical-physics","display_name":"Statistical physics","score":0.32294270396232605},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2945592701435089},{"id":"https://openalex.org/keywords/structural-similarity","display_name":"Structural similarity","score":0.23662149906158447},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.21494576334953308}],"concepts":[{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.7377269268035889},{"id":"https://openalex.org/C24259465","wikidata":"https://www.wikidata.org/wiki/Q2272153","display_name":"Chemical similarity","level":3,"score":0.4391757845878601},{"id":"https://openalex.org/C116738811","wikidata":"https://www.wikidata.org/wiki/Q608751","display_name":"Nearest neighbor search","level":2,"score":0.42700162529945374},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.40713173151016235},{"id":"https://openalex.org/C121864883","wikidata":"https://www.wikidata.org/wiki/Q677916","display_name":"Statistical physics","level":1,"score":0.32294270396232605},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2945592701435089},{"id":"https://openalex.org/C139489369","wikidata":"https://www.wikidata.org/wiki/Q770846","display_name":"Structural similarity","level":2,"score":0.23662149906158447},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.21494576334953308},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2402.07970","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.07970","pdf_url":"https://arxiv.org/pdf/2402.07970","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2402.07970","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2402.07970","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2402.07970","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.07970","pdf_url":"https://arxiv.org/pdf/2402.07970","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1413283406","display_name":null,"funder_award_id":"U01CA238475","funder_id":"https://openalex.org/F4320337351","funder_display_name":"National Cancer Institute"},{"id":"https://openalex.org/G1858569161","display_name":null,"funder_award_id":"T32GM086330","funder_id":"https://openalex.org/F4320332161","funder_display_name":"National Institutes of Health"},{"id":"https://openalex.org/G2293978286","display_name":null,"funder_award_id":"T32GM135122","funder_id":"https://openalex.org/F4320337354","funder_display_name":"National Institute of General Medical Sciences"},{"id":"https://openalex.org/G2671694828","display_name":null,"funder_award_id":"U24DK116204","funder_id":"https://openalex.org/F4320337357","funder_display_name":"National Institute of Diabetes and Digestive and Kidney Diseases"},{"id":"https://openalex.org/G2851989493","display_name":null,"funder_award_id":"U24DK116204","funder_id":"https://openalex.org/F4320332161","funder_display_name":"National Institutes of Health"},{"id":"https://openalex.org/G3219550263","display_name":null,"funder_award_id":"T32GM135122","funder_id":"https://openalex.org/F4320332161","funder_display_name":"National Institutes of Health"},{"id":"https://openalex.org/G3664618867","display_name":null,"funder_award_id":"T32GM086330","funder_id":"https://openalex.org/F4320337354","funder_display_name":"National Institute of General Medical Sciences"},{"id":"https://openalex.org/G5474110026","display_name":null,"funder_award_id":"R01GM140154","funder_id":"https://openalex.org/F4320337354","funder_display_name":"National Institute of General Medical Sciences"},{"id":"https://openalex.org/G6738469628","display_name":null,"funder_award_id":"R01CA233811","funder_id":"https://openalex.org/F4320337351","funder_display_name":"National Cancer Institute"},{"id":"https://openalex.org/G7607605106","display_name":null,"funder_award_id":"R01GM140154","funder_id":"https://openalex.org/F4320332161","funder_display_name":"National Institutes of Health"}],"funders":[{"id":"https://openalex.org/F4320332161","display_name":"National Institutes of Health","ror":"https://ror.org/01cwqze88"},{"id":"https://openalex.org/F4320332600","display_name":"University of North Carolina at Chapel Hill","ror":"https://ror.org/0130frc33"},{"id":"https://openalex.org/F4320337351","display_name":"National Cancer Institute","ror":"https://ror.org/040gcmg81"},{"id":"https://openalex.org/F4320337354","display_name":"National Institute of General Medical Sciences","ror":"https://ror.org/04q48ey07"},{"id":"https://openalex.org/F4320337357","display_name":"National Institute of Diabetes and Digestive and Kidney Diseases","ror":"https://ror.org/00adh9b73"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4391833093.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1987966112","https://openalex.org/W4292296636","https://openalex.org/W358917076","https://openalex.org/W2384995271","https://openalex.org/W2412446857","https://openalex.org/W2090768198","https://openalex.org/W1576126133","https://openalex.org/W2042007894","https://openalex.org/W4249871115","https://openalex.org/W22579956"],"abstract_inverted_index":{"Nearest":[0],"neighbor-based":[1],"similarity":[2,125,189],"searching":[3,77],"is":[4],"a":[5,30,101,108,140,163,166],"common":[6],"task":[7,27,59],"in":[8,14,44,160],"chemistry,":[9],"with":[10,92],"notable":[11],"use":[12],"cases":[13],"drug":[15],"discovery.":[16],"Yet,":[17],"some":[18],"of":[19,50,84,103,133,172],"the":[20,47,176],"most":[21],"commonly":[22],"used":[23],"approaches":[24],"for":[25,57,147],"this":[26,35,58,148,151],"still":[28],"leverage":[29,75],"brute-force":[31,177],"approach.":[32,178],"In":[33],"practice":[34],"can":[36,113],"be":[37],"computationally":[38],"costly":[39],"and":[40,107],"overly":[41],"time-consuming,":[42],"due":[43],"part":[45],"to":[46,65],"sheer":[48],"size":[49],"modern":[51],"chemical":[52,95,105,124,135,188],"databases.":[53],"Previous":[54],"computational":[55],"advancements":[56],"have":[60],"generally":[61],"relied":[62],"on":[63,122,154,165,187],"improvements":[64],"hardware":[66],"or":[67],"dataset-specific":[68],"tricks":[69],"that":[70,74,182],"lack":[71],"generalizability.":[72],"Approaches":[73],"lower-complexity":[76],"algorithms":[78,86],"remain":[79],"relatively":[80],"underexplored.":[81],"However,":[82],"many":[83],"these":[85],"are":[87],"approximate":[88],"solutions":[89],"and/or":[90],"struggle":[91],"typical":[93],"high-dimensional":[94],"embeddings.":[96],"Here":[97],"we":[98],"evaluate":[99],"whether":[100],"combination":[102],"low-dimensional":[104],"embeddings":[106,136],"k-d":[109],"tree":[110],"data":[111],"structure":[112],"achieve":[114],"fast":[115],"nearest":[116],"neighbor":[117],"queries":[118],"while":[119],"maintaining":[120],"performance":[121,186],"standard":[123,134],"search":[126],"benchmarks.":[127,190],"We":[128,179],"examine":[129],"different":[130],"dimensionality":[131],"reductions":[132],"as":[137,139],"well":[138],"learned,":[141],"structurally-aware":[142],"embedding":[143],"--":[144,146],"SmallSA":[145,183],"task.":[149],"With":[150],"framework,":[152],"searches":[153],"over":[155],"one":[156],"billion":[157],"chemicals":[158],"execute":[159],"less":[161],"than":[162,175],"second":[164],"single":[167],"CPU":[168],"core,":[169],"five":[170],"orders":[171],"magnitude":[173],"faster":[174],"also":[180],"demonstrate":[181],"achieves":[184],"competitive":[185]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2024-02-15T00:00:00"}
