{"id":"https://openalex.org/W4221047533","doi":"https://doi.org/10.1021/acs.jcim.1c01198","title":"PDFDataExtractor: A Tool for Reading Scientific Text and Interpreting Metadata from the Typeset Literature in the Portable Document Format","display_name":"PDFDataExtractor: A Tool for Reading Scientific Text and Interpreting Metadata from the Typeset Literature in the Portable Document Format","publication_year":2022,"publication_date":"2022-03-29","ids":{"openalex":"https://openalex.org/W4221047533","doi":"https://doi.org/10.1021/acs.jcim.1c01198","pmid":"https://pubmed.ncbi.nlm.nih.gov/35349259"},"language":"en","primary_location":{"id":"doi:10.1021/acs.jcim.1c01198","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.1c01198","pdf_url":"https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.1c01198","source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.1c01198","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100528451","display_name":"Miao Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Miao Zhu","raw_affiliation_strings":["Cavendish Laboratory, Department of Physics, University of Cambridge, J. J. Thomson Avenue, Cambridge CB3 0HE, U.K"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Cavendish Laboratory, Department of Physics, University of Cambridge, J. J. Thomson Avenue, Cambridge CB3 0HE, U.K","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068607578","display_name":"Jacqueline M. Cole","orcid":"https://orcid.org/0000-0002-1552-8743"},"institutions":[{"id":"https://openalex.org/I1286704778","display_name":"Rutherford Appleton Laboratory","ror":"https://ror.org/03gq8fr08","country_code":"GB","type":"facility","lineage":["https://openalex.org/I1286704778","https://openalex.org/I162524378","https://openalex.org/I4210087105"]},{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Jacqueline M. Cole","raw_affiliation_strings":["Cavendish Laboratory, Department of Physics, University of Cambridge, J. J. Thomson Avenue, Cambridge CB3 0HE, U.K","Department of Chemical Engineering and Biotechnology, University of Cambridge, West Cambridge Site, Philippa Fawcett Drive, Cambridge CB3 0AS, U.K","ISIS Neutron and Muon Source, STFC Rutherford Appleton Laboratory, Harwell Science and Innovation Campus, Didcot, Oxfordshire OX11 0QX, U.K"],"raw_orcid":"https://orcid.org/0000-0002-1552-8743","affiliations":[{"raw_affiliation_string":"Cavendish Laboratory, Department of Physics, University of Cambridge, J. J. Thomson Avenue, Cambridge CB3 0HE, U.K","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"Department of Chemical Engineering and Biotechnology, University of Cambridge, West Cambridge Site, Philippa Fawcett Drive, Cambridge CB3 0AS, U.K","institution_ids":["https://openalex.org/I241749"]},{"raw_affiliation_string":"ISIS Neutron and Muon Source, STFC Rutherford Appleton Laboratory, Harwell Science and Innovation Campus, Didcot, Oxfordshire OX11 0QX, U.K","institution_ids":["https://openalex.org/I1286704778"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5068607578"],"corresponding_institution_ids":["https://openalex.org/I1286704778","https://openalex.org/I241749"],"apc_list":null,"apc_paid":null,"fwci":3.9867,"has_fulltext":true,"cited_by_count":49,"citation_normalized_percentile":{"value":0.95089533,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":"62","issue":"7","first_page":"1633","last_page":"1643"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8205559849739075},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.7914571762084961},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6496200561523438},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.553459107875824},{"id":"https://openalex.org/keywords/file-format","display_name":"File format","score":0.5235918760299683},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5014047622680664},{"id":"https://openalex.org/keywords/metadata-repository","display_name":"Metadata repository","score":0.43913114070892334},{"id":"https://openalex.org/keywords/data-element","display_name":"Data element","score":0.4378302991390228},{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.4168930947780609},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.41670671105384827},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.41150689125061035},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3813368082046509},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3517695665359497},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.1713082492351532}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8205559849739075},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.7914571762084961},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6496200561523438},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.553459107875824},{"id":"https://openalex.org/C97250363","wikidata":"https://www.wikidata.org/wiki/Q235557","display_name":"File format","level":2,"score":0.5235918760299683},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5014047622680664},{"id":"https://openalex.org/C153048206","wikidata":"https://www.wikidata.org/wiki/Q3454922","display_name":"Metadata repository","level":3,"score":0.43913114070892334},{"id":"https://openalex.org/C30872290","wikidata":"https://www.wikidata.org/wiki/Q1172389","display_name":"Data element","level":3,"score":0.4378302991390228},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.4168930947780609},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.41670671105384827},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.41150689125061035},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3813368082046509},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3517695665359497},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.1713082492351532},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[{"descriptor_ui":"D000071253","descriptor_name":"Metadata","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000071253","descriptor_name":"Metadata","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000071253","descriptor_name":"Metadata","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D009323","descriptor_name":"Natural Language Processing","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D009323","descriptor_name":"Natural Language Processing","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D009323","descriptor_name":"Natural Language Processing","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D011932","descriptor_name":"Reading","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D011932","descriptor_name":"Reading","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D011932","descriptor_name":"Reading","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D012984","descriptor_name":"Software","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D012984","descriptor_name":"Software","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D012984","descriptor_name":"Software","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D057225","descriptor_name":"Data Mining","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false}],"locations_count":5,"locations":[{"id":"doi:10.1021/acs.jcim.1c01198","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.1c01198","pdf_url":"https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.1c01198","source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},{"id":"pmid:35349259","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/35349259","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of chemical information and modeling","raw_type":null},{"id":"pmh:oai:osti.gov:1981870","is_oa":true,"landing_page_url":"https://www.osti.gov/biblio/1981870","pdf_url":"https://www.osti.gov/servlets/purl/1981870","source":{"id":"https://openalex.org/S4306402487","display_name":"OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I139351228","host_organization_name":"Office of Scientific and Technical Information","host_organization_lineage":["https://openalex.org/I139351228"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:9049592","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/9049592","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"J Chem Inf Model","raw_type":"Text"},{"id":"pmh:oai:purl.org/net/epubs:work/52322038","is_oa":true,"landing_page_url":"http://purl.org/net/epubs/work/52322038","pdf_url":null,"source":{"id":"https://openalex.org/S4306400600","display_name":"ePubs (Science and Technology Facilities Council, Research Councils UK)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I162524378","host_organization_name":"Science and Technology Facilities Council","host_organization_lineage":["https://openalex.org/I162524378"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1021/acs.jcim.1c01198","is_oa":true,"landing_page_url":"https://doi.org/10.1021/acs.jcim.1c01198","pdf_url":"https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.1c01198","source":{"id":"https://openalex.org/S167262187","display_name":"Journal of Chemical Information and Modeling","issn_l":"1549-9596","issn":["1549-9596","1549-960X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320006","host_organization_name":"American Chemical Society","host_organization_lineage":["https://openalex.org/P4310320006"],"host_organization_lineage_names":["American Chemical Society"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Chemical Information and Modeling","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8399999737739563,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1531024995","display_name":null,"funder_award_id":"RCSRF1819\\7\\10","funder_id":"https://openalex.org/F4320320005","funder_display_name":"Royal Academy of Engineering"}],"funders":[{"id":"https://openalex.org/F4320307790","display_name":"BASF","ror":"https://ror.org/01q8f6705"},{"id":"https://openalex.org/F4320320005","display_name":"Royal Academy of Engineering","ror":"https://ror.org/0526snb40"},{"id":"https://openalex.org/F4320334632","display_name":"Science and Technology Facilities Council","ror":"https://ror.org/057g20z61"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4221047533.pdf","grobid_xml":"https://content.openalex.org/works/W4221047533.grobid-xml"},"referenced_works_count":11,"referenced_works":["https://openalex.org/W791527587","https://openalex.org/W1530730108","https://openalex.org/W1603719052","https://openalex.org/W1655306493","https://openalex.org/W2061598441","https://openalex.org/W2123442489","https://openalex.org/W2166633748","https://openalex.org/W2501572931","https://openalex.org/W2523785361","https://openalex.org/W2610394652","https://openalex.org/W4230882300"],"related_works":["https://openalex.org/W2379265733","https://openalex.org/W2183628870","https://openalex.org/W92613187","https://openalex.org/W2374379029","https://openalex.org/W3016521766","https://openalex.org/W2367786714","https://openalex.org/W3084772717","https://openalex.org/W2170906434","https://openalex.org/W1503116306","https://openalex.org/W2084876625"],"abstract_inverted_index":{"The":[0,163,172],"layout":[1],"of":[2,161,167,189,199,233,272],"portable":[3],"document":[4,248,274],"format":[5],"(PDF)":[6],"files":[7,55,188],"is":[8,38,213],"constant":[9],"to":[10,20,41,57,62,83,90,93,141,156,182,194],"any":[11],"screen,":[12],"and":[13,26,34,78,86,227,253],"the":[14,73,114,132,149,157,186,196,231,273],"metadata":[15,232,270],"therein":[16],"are":[17,31,69,169],"latent,":[18],"compared":[19],"mark-up":[21],"languages":[22],"such":[23,99,112,237],"as":[24,100,113,138,238],"HTML":[25],"XML.":[27],"No":[28],"semantic":[29,180],"tags":[30],"usually":[32],"provided,":[33],"a":[35,139,175,234,257],"PDF":[36,54,127,187,235],"file":[37],"not":[39],"designed":[40],"be":[42,58,84,91,107,183],"edited":[43],"or":[44],"its":[45,154],"data":[46,51,66,80,95],"interpreted":[47],"by":[48,152],"software.":[49],"However,":[50],"held":[52],"in":[53,60,97,192,225],"need":[56,82,89],"extracted":[59,184],"order":[61,193],"comply":[63],"with":[64],"open-source":[65],"requirements":[67],"that":[68],"now":[70],"government-regulated.":[71],"In":[72],"chemical":[74,77,150],"domain,":[75],"related":[76],"property":[79],"also":[81],"found,":[85],"their":[87],"correlations":[88],"exploited":[92],"enable":[94],"science":[96],"areas":[98,271],"data-driven":[101],"materials":[102],"discovery.":[103],"Such":[104],"relationships":[105],"may":[106],"realized":[108],"using":[109],"text-mining":[110],"software":[111],"\"chemistry-aware\"":[115],"natural-language-processing":[116],"tool,":[117,134],"ChemDataExtractor;":[118],"however,":[119],"this":[120,210],"tool":[121],"has":[122],"limited":[123],"data-extraction":[124],"capabilities":[125,160],"from":[126,185],"files.":[128],"This":[129,178],"study":[130],"presents":[131],"PDFDataExtractor":[133,222,262],"which":[135],"can":[136],"act":[137],"plug-in":[140],"ChemDataExtractor.":[142,162],"It":[143],"outperforms":[144],"other":[145,202],"PDF-extraction":[146],"tools":[147,205],"for":[148,266],"literature":[151],"coupling":[153],"functionalities":[155],"chemical-named":[158],"entity-recognition":[159],"intrinsic":[164],"PDF-reading":[165],"abilities":[166],"ChemDataExtractor":[168],"much":[170],"improved.":[171],"system":[173,212],"features":[174],"template-based":[176,211],"architecture.":[177],"enables":[179],"information":[181,224],"scientific":[190],"articles":[191],"reconstruct":[195],"logical":[197],"structure":[198],"articles.":[200],"While":[201],"existing":[203],"PDF-extracting":[204],"focus":[206],"on":[207,216,219],"quantity":[208],"mining,":[209],"more":[214],"focused":[215],"quality":[217],"mining":[218],"different":[220],"layouts.":[221],"outputs":[223],"JSON":[226],"plain":[228],"text,":[229],"including":[230],"file,":[236],"paper":[239],"title,":[240],"authors,":[241],"affiliation,":[242],"email,":[243],"abstract,":[244],"keywords,":[245],"journal,":[246],"year,":[247],"object":[249],"identifier":[250],"(DOI),":[251],"reference,":[252],"issue":[254],"number.":[255],"With":[256],"self-created":[258],"evaluation":[259],"article":[260],"set,":[261],"achieved":[263],"promising":[264],"precision":[265],"all":[267],"key":[268],"assessed":[269],"text.":[275]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":16},{"year":2024,"cited_by_count":16},{"year":2023,"cited_by_count":11},{"year":2022,"cited_by_count":3}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
