{"id":"https://openalex.org/W4400177400","doi":"https://doi.org/10.1021/acs.jcim.4c00572","title":"OpenChemIE: An Information Extraction Toolkit for Chemistry Literature","display_name":"OpenChemIE: An Information Extraction Toolkit for Chemistry Literature","publication_year":2024,"publication_date":"2024-07-01","ids":{"openalex":"https://openalex.org/W4400177400","doi":"https://doi.org/10.1021/acs.jcim.4c00572","pmid":"https://pubmed.ncbi.nlm.nih.gov/38950894"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.4c00572","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.4c00572","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5094579757","display_name":"Vincent Fan","orcid":"https://orcid.org/0009-0002-8774-5015"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Vincent Fan","raw_affiliation_strings":["Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States"],"affiliations":[{"raw_affiliation_string":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101546902","display_name":"Yujie Qian","orcid":"https://orcid.org/0000-0003-3747-4552"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yujie Qian","raw_affiliation_strings":["Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States"],"affiliations":[{"raw_affiliation_string":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103982993","display_name":"Alex Wang","orcid":"https://orcid.org/0000-0003-0353-2937"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alex Wang","raw_affiliation_strings":["Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States"],"affiliations":[{"raw_affiliation_string":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049219989","display_name":"Amber Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Amber Wang","raw_affiliation_strings":["Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States"],"affiliations":[{"raw_affiliation_string":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076162644","display_name":"Connor W. Coley","orcid":"https://orcid.org/0000-0002-8271-8723"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Connor W. Coley","raw_affiliation_strings":["Department of Chemical Engineering, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States"],"affiliations":[{"raw_affiliation_string":"Department of Chemical Engineering, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010124873","display_name":"Regina Barzilay","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Regina Barzilay","raw_affiliation_strings":["Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States"],"affiliations":[{"raw_affiliation_string":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, United States","institution_ids":["https://openalex.org/I63966007"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5010124873","https://openalex.org/A5094579757","https://openalex.org/A5101546902"],"corresponding_institution_ids":["https://openalex.org/I63966007"],"apc_list":null,"apc_paid":null,"fwci":6.3636,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.96982432,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"64","issue":"14","first_page":"5521","last_page":"5534"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.565165638923645},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.47326210141181946},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.4576725959777832},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.42194512486457825},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3588147759437561},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.2533959746360779}],"concepts":[{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.565165638923645},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.47326210141181946},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.4576725959777832},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.42194512486457825},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3588147759437561},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.2533959746360779}],"mesh":[{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000069550","descriptor_name":"Machine Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000080911","descriptor_name":"Cheminformatics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D000080911","descriptor_name":"Cheminformatics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D000080911","descriptor_name":"Cheminformatics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D062126","descriptor_name":"Databases, Chemical","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D062126","descriptor_name":"Databases, Chemical","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D062126","descriptor_name":"Databases, Chemical","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false}],"locations_count":2,"locations":[{"id":"doi:10.1021/acs.jcim.4c00572","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.4c00572","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:38950894","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/38950894","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.550000011920929}],"awards":[{"id":"https://openalex.org/G2082912093","display_name":null,"funder_award_id":"1918839","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2200512414","display_name":null,"funder_award_id":"CCF-2112665","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2835580799","display_name":null,"funder_award_id":"2134795","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G496264345","display_name":null,"funder_award_id":"HR00111920025","funder_id":"https://openalex.org/F4320332180","funder_display_name":"Defense Advanced Research Projects Agency"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"},{"id":"https://openalex.org/F4320332186","display_name":"Defense Threat Reduction Agency","ror":"https://ror.org/04tz64554"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1966456689","https://openalex.org/W1971538385","https://openalex.org/W2111044246","https://openalex.org/W2145870108","https://openalex.org/W2165671627","https://openalex.org/W2523785361","https://openalex.org/W2901942917","https://openalex.org/W2911489562","https://openalex.org/W2963734039","https://openalex.org/W3007750971","https://openalex.org/W3013318641","https://openalex.org/W3015725310","https://openalex.org/W3092396754","https://openalex.org/W3094332970","https://openalex.org/W3096609285","https://openalex.org/W3103383152","https://openalex.org/W3159789740","https://openalex.org/W3168661259","https://openalex.org/W3201277766","https://openalex.org/W3202466114","https://openalex.org/W3203895579","https://openalex.org/W4221047533","https://openalex.org/W4223436720","https://openalex.org/W4226262621","https://openalex.org/W4281484410","https://openalex.org/W4311366447","https://openalex.org/W4353112191","https://openalex.org/W4382198765","https://openalex.org/W4386001223","https://openalex.org/W4386244642","https://openalex.org/W4386884238","https://openalex.org/W4389523661","https://openalex.org/W4392468778"],"related_works":["https://openalex.org/W4387497383","https://openalex.org/W2948807893","https://openalex.org/W2778153218","https://openalex.org/W2748952813","https://openalex.org/W1531601525","https://openalex.org/W4391375266","https://openalex.org/W2078814861","https://openalex.org/W2527526854","https://openalex.org/W1976181487","https://openalex.org/W1986764834"],"abstract_inverted_index":{"Information":[0],"extraction":[1,16,50,126,176,200],"from":[2,33,68,108,117,131],"chemistry":[3,99,203],"literature":[4],"is":[5,195],"vital":[6],"for":[7,12,98,124,198],"constructing":[8],"up-to-date":[9],"reaction":[10,52,129,132,156,175],"databases":[11],"data-driven":[13],"chemistry.":[14],"Complete":[15],"requires":[17],"combining":[18],"information":[19,67,100,116,199],"across":[20],"text,":[21],"tables,":[22],"and":[23,47,71,134,148,217],"figures,":[24],"whereas":[25],"prior":[26],"work":[27],"has":[28],"mainly":[29],"investigated":[30],"extracting":[31,65],"reactions":[32,107],"single":[34],"modalities.":[35],"In":[36],"this":[37,44],"paper,":[38],"we":[39,87,149],"present":[40],"OpenChemIE":[41,58,179,194,227],"to":[42,76,160,229],"address":[43,94],"complex":[45],"challenge":[46],"enable":[48],"the":[49,55,60,74,84,115,125,174,190,230],"of":[51,81,127,155,171,178,184],"data":[53,130],"at":[54],"document":[56],"level.":[57],"approaches":[59],"problem":[61],"in":[62,215],"two":[63],"steps:":[64],"relevant":[66],"individual":[69],"modalities":[70],"then":[72,113],"integrating":[73],"results":[75,177],"obtain":[77],"a":[78,95,152,165,222,240],"final":[79],"list":[80],"reactions.":[82],"For":[83],"first":[85],"step,":[86],"employ":[88],"specialized":[89],"neural":[90],"models":[91,141],"that":[92],"each":[93],"specific":[96],"task":[97],"extraction,":[101],"such":[102],"as":[103,164,210,232,236,238],"parsing":[104],"molecules":[105,206],"or":[106,110,213],"text":[109,216],"figures.":[111],"We":[112,225],"integrate":[114],"these":[118],"modules":[119],"using":[120],"chemistry-informed":[121],"algorithms,":[122],"allowing":[123],"fine-grained":[128],"condition":[133],"substrate":[135],"scope":[136],"investigations.":[137],"Our":[138],"machine":[139],"learning":[140],"attain":[142,180],"state-of-the-art":[143],"performance":[144],"when":[145,186],"evaluated":[146],"individually,":[147],"meticulously":[150],"annotate":[151],"challenging":[153],"dataset":[154],"schemes":[157],"with":[158],"R-groups":[159],"evaluate":[161],"our":[162],"pipeline":[163],"whole,":[166],"achieving":[167],"an":[168,181,233],"F1":[169],"score":[170,183],"69.5%.":[172],"Additionally,":[173],"accuracy":[182],"64.3%":[185],"directly":[187],"compared":[188],"against":[189],"Reaxys":[191],"chemical":[192],"database.":[193],"most":[196],"suited":[197],"on":[201],"organic":[202],"literature,":[204],"where":[205],"are":[207],"generally":[208],"depicted":[209],"planar":[211],"graphs":[212],"written":[214],"can":[218],"be":[219],"consolidated":[220],"into":[221],"SMILES":[223],"format.":[224],"provide":[226],"freely":[228],"public":[231],"open-source":[234],"package,":[235],"well":[237],"through":[239],"web":[241],"interface.":[242]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":2}],"updated_date":"2026-03-22T08:09:32.410652","created_date":"2025-10-10T00:00:00"}
