{"id":"https://openalex.org/W2523785361","doi":"https://doi.org/10.1021/acs.jcim.6b00207","title":"ChemDataExtractor: A Toolkit for Automated Extraction of Chemical Information from the Scientific Literature","display_name":"ChemDataExtractor: A Toolkit for Automated Extraction of Chemical Information from the Scientific Literature","publication_year":2016,"publication_date":"2016-09-26","ids":{"openalex":"https://openalex.org/W2523785361","doi":"https://doi.org/10.1021/acs.jcim.6b00207","mag":"2523785361","pmid":"https://pubmed.ncbi.nlm.nih.gov/27669338"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.6b00207","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.6b00207","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","datacite","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.17863/cam.10935","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030876876","display_name":"Matthew C. Swain","orcid":"https://orcid.org/0000-0001-6428-5189"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]},{"id":"https://openalex.org/I4210096386","display_name":"Bridge University","ror":"https://ror.org/00cbm0437","country_code":"SS","type":"education","lineage":["https://openalex.org/I4210096386"]}],"countries":["GB","SS"],"is_corresponding":true,"raw_author_name":"Matthew C. Swain","raw_affiliation_strings":["Cavendish Laboratory, University of Cambridge, J. J. Thomson Avenue, Cambridge, CB3 0HE, U.K"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Cavendish Laboratory, University of Cambridge, J. J. Thomson Avenue, Cambridge, CB3 0HE, U.K","institution_ids":["https://openalex.org/I4210096386","https://openalex.org/I241749"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068607578","display_name":"Jacqueline M. Cole","orcid":"https://orcid.org/0000-0002-1552-8743"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]},{"id":"https://openalex.org/I4210096386","display_name":"Bridge University","ror":"https://ror.org/00cbm0437","country_code":"SS","type":"education","lineage":["https://openalex.org/I4210096386"]}],"countries":["GB","SS"],"is_corresponding":false,"raw_author_name":"Jacqueline M. Cole","raw_affiliation_strings":["Cavendish Laboratory, University of Cambridge, J. J. Thomson Avenue, Cambridge, CB3 0HE, U.K"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Cavendish Laboratory, University of Cambridge, J. J. Thomson Avenue, Cambridge, CB3 0HE, U.K","institution_ids":["https://openalex.org/I4210096386","https://openalex.org/I241749"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5030876876"],"corresponding_institution_ids":["https://openalex.org/I241749","https://openalex.org/I4210096386"],"apc_list":null,"apc_paid":null,"fwci":10.0958,"has_fulltext":false,"cited_by_count":576,"citation_normalized_percentile":{"value":0.98570255,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"56","issue":"10","first_page":"1894","last_page":"1904"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9541000127792358,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5370297431945801},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4867527484893799},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.47216886281967163},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.46820905804634094},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.41872429847717285},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.24294328689575195},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.17288950085639954}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5370297431945801},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4867527484893799},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.47216886281967163},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.46820905804634094},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.41872429847717285},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.24294328689575195},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.17288950085639954}],"mesh":[{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D009323","descriptor_name":"Natural Language Processing","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D009323","descriptor_name":"Natural Language Processing","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D009323","descriptor_name":"Natural Language Processing","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D010363","descriptor_name":"Pattern Recognition, Automated","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D010363","descriptor_name":"Pattern Recognition, Automated","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D010363","descriptor_name":"Pattern Recognition, Automated","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D010506","descriptor_name":"Periodicals as Topic","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D010506","descriptor_name":"Periodicals as Topic","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D010506","descriptor_name":"Periodicals as Topic","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016000","descriptor_name":"Cluster Analysis","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016000","descriptor_name":"Cluster Analysis","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016000","descriptor_name":"Cluster Analysis","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":false},{"descriptor_ui":"D062126","descriptor_name":"Databases, Chemical","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D062126","descriptor_name":"Databases, Chemical","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D062126","descriptor_name":"Databases, Chemical","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":4,"locations":[{"id":"doi:10.1021/acs.jcim.6b00207","is_oa":false,"landing_page_url":"https://doi.org/10.1021/acs.jcim.6b00207","pdf_url":null,"source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:27669338","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/27669338","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null},{"id":"pmh:oai:www.repository.cam.ac.uk:1810/266462","is_oa":false,"landing_page_url":"https://www.repository.cam.ac.uk/handle/1810/266462","pdf_url":null,"source":{"id":"https://openalex.org/S4306401777","display_name":"Apollo (University of Cambridge)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I241749","host_organization_name":"University of Cambridge","host_organization_lineage":["https://openalex.org/I241749"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.17863/cam.10935","is_oa":true,"landing_page_url":"https://doi.org/10.17863/cam.10935","pdf_url":null,"source":{"id":"https://openalex.org/S7407050737","display_name":"Apollo","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"doi:10.17863/cam.10935","is_oa":true,"landing_page_url":"https://doi.org/10.17863/cam.10935","pdf_url":null,"source":{"id":"https://openalex.org/S7407050737","display_name":"Apollo","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article-journal"},"sustainable_development_goals":[{"score":0.7200000286102295,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G2614401929","display_name":"DTA - University of Cambridge","funder_award_id":"EP/J500380/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320320247","display_name":"Royal Commission for the Exhibition of 1851","ror":"https://ror.org/05fdb2817"},{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W192665053","https://openalex.org/W371426616","https://openalex.org/W1966456689","https://openalex.org/W1991154713","https://openalex.org/W1992985800","https://openalex.org/W2018951244","https://openalex.org/W2044990793","https://openalex.org/W2097960255","https://openalex.org/W2099369363","https://openalex.org/W2101553882","https://openalex.org/W2111044246","https://openalex.org/W2121227244","https://openalex.org/W2121244856","https://openalex.org/W2121764873","https://openalex.org/W2133527736","https://openalex.org/W2135505477","https://openalex.org/W2136794542","https://openalex.org/W2145870108","https://openalex.org/W2149369282","https://openalex.org/W2158139315","https://openalex.org/W2160022416","https://openalex.org/W2165671627","https://openalex.org/W2166468803","https://openalex.org/W2169491861","https://openalex.org/W2170484170","https://openalex.org/W2236252949","https://openalex.org/W2250978584","https://openalex.org/W2266664921","https://openalex.org/W2394935510","https://openalex.org/W2531274738"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"The":[0,205],"emergence":[1],"of":[2,23,54,67,124,133,147,185,207,215,222,251],"\"big":[3],"data\"":[4],"initiatives":[5],"has":[6],"led":[7],"to":[8,51,84,171,210,265],"the":[9,28,49,64,122,144,183,203,208,240,259],"need":[10],"for":[11,63,99,116,154,182,227],"tools":[12,254],"that":[13,80,151,177,198],"can":[14,34,81],"automatically":[15],"extract":[16,212],"valuable":[17],"chemical":[18,32,68,87,117,186,194,229,234,242],"information":[19,33,44,140],"from":[20,77,267],"large":[21],"volumes":[22],"unstructured":[24],"data,":[25],"such":[26,159],"as":[27,160],"scientific":[29,78],"literature.":[30],"Since":[31],"be":[35,82],"present":[36,59,143],"in":[37,202],"figures,":[38],"tables,":[39],"and":[40,70,75,106,139,164,175,190,196,225,233,262],"textual":[41,161],"paragraphs,":[42,162],"successful":[43],"extraction":[45,66,244],"often":[46],"depends":[47],"on":[48,129],"ability":[50],"interpret":[52],"all":[53],"these":[55],"domains":[56,158],"simultaneously.":[57],"We":[58,166],"a":[60,130,248],"complete":[61],"toolkit":[62,209],"automated":[65],"entities":[69],"their":[71],"associated":[72],"properties,":[73],"measurements,":[74],"relationships":[76],"documents":[79],"used":[83],"populate":[85],"structured":[86],"databases.":[88],"Our":[89],"system":[90],"provides":[91],"an":[92,220],"extensible,":[93],"chemistry-aware,":[94],"natural":[95],"language":[96],"processing":[97,170],"pipeline":[98],"tokenization,":[100],"part-of-speech":[101],"tagging,":[102],"named":[103,118],"entity":[104,119],"recognition,":[105],"phrase":[107,137],"parsing.":[108],"Within":[109],"this":[110,178],"scope,":[111],"we":[112,142],"report":[113],"improved":[114],"performance":[115,206],"recognition":[120],"through":[121],"use":[123,146],"unsupervised":[125],"word":[126],"clustering":[127],"based":[128],"massive":[131],"corpus":[132],"chemistry":[134],"articles.":[135],"For":[136],"parsing":[138],"extraction,":[141],"novel":[145],"multiple":[148],"rule-based":[149],"grammars":[150],"are":[152,199,263],"tailored":[153],"interpreting":[155],"specific":[156],"document":[157],"captions,":[163],"tables.":[165],"also":[167],"describe":[168],"document-level":[169],"resolve":[172],"data":[173,216],"interdependencies":[174],"show":[176],"is":[179],"particularly":[180],"necessary":[181],"autogeneration":[184],"databases":[187],"since":[188],"captions":[189],"tables":[191],"commonly":[192],"contain":[193],"identifiers":[195],"references":[197],"defined":[200],"elsewhere":[201],"text.":[204],"correctly":[211],"various":[213],"types":[214],"was":[217],"evaluated,":[218],"affording":[219],"F-score":[221,250],"93.4%,":[223],"86.8%,":[224],"91.5%":[226],"extracting":[228],"identifiers,":[230],"spectroscopic":[231],"attributes,":[232,236],"property":[235],"respectively;":[237],"set":[238],"against":[239],"CHEMDNER":[241],"name":[243],"challenge,":[245],"ChemDataExtractor":[246],"yields":[247],"competitive":[249],"87.8%.":[252],"All":[253],"have":[255],"been":[256],"released":[257],"under":[258],"MIT":[260],"license":[261],"available":[264],"download":[266],"http://www.chemdataextractor.org":[268],".":[269]},"counts_by_year":[{"year":2026,"cited_by_count":39},{"year":2025,"cited_by_count":105},{"year":2024,"cited_by_count":84},{"year":2023,"cited_by_count":107},{"year":2022,"cited_by_count":72},{"year":2021,"cited_by_count":70},{"year":2020,"cited_by_count":41},{"year":2019,"cited_by_count":36},{"year":2018,"cited_by_count":14},{"year":2017,"cited_by_count":8}],"updated_date":"2026-05-12T08:28:47.272897","created_date":"2025-10-10T00:00:00"}
