{"id":"https://openalex.org/W4414019658","doi":"https://doi.org/10.3389/frai.2025.1466092","title":"A review on knowledge and information extraction from PDF documents and storage approaches","display_name":"A review on knowledge and information extraction from PDF documents and storage approaches","publication_year":2025,"publication_date":"2025-09-05","ids":{"openalex":"https://openalex.org/W4414019658","doi":"https://doi.org/10.3389/frai.2025.1466092","pmid":"https://pubmed.ncbi.nlm.nih.gov/40979437"},"language":"en","primary_location":{"id":"doi:10.3389/frai.2025.1466092","is_oa":true,"landing_page_url":"https://doi.org/10.3389/frai.2025.1466092","pdf_url":"https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2025.1466092/pdf","source":{"id":"https://openalex.org/S4210197006","display_name":"Frontiers in Artificial Intelligence","issn_l":"2624-8212","issn":["2624-8212"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320527","host_organization_name":"Frontiers Media","host_organization_lineage":["https://openalex.org/P4310320527"],"host_organization_lineage_names":["Frontiers Media"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence","raw_type":"journal-article"},"type":"review","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2025.1466092/pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119550483","display_name":"Salvador D. Atagong","orcid":null},"institutions":[{"id":"https://openalex.org/I168685008","display_name":"International Centre of Insect Physiology and Ecology","ror":"https://ror.org/03qegss47","country_code":"KE","type":"facility","lineage":["https://openalex.org/I168685008"]},{"id":"https://openalex.org/I95023434","display_name":"University of KwaZulu-Natal","ror":"https://ror.org/04qzfn040","country_code":"ZA","type":"education","lineage":["https://openalex.org/I95023434"]}],"countries":["KE","ZA"],"is_corresponding":true,"raw_author_name":"Salvador D. Atagong","raw_affiliation_strings":["Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya","Department of Environmental Science, University of KwaZulu Natal, Durban, South Africa"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya","institution_ids":["https://openalex.org/I168685008"]},{"raw_affiliation_string":"Department of Environmental Science, University of KwaZulu Natal, Durban, South Africa","institution_ids":["https://openalex.org/I95023434"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113109089","display_name":"Henri Tonnang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210122440","display_name":"International Institute of Tropical Agriculture","ror":"https://ror.org/02smred28","country_code":"NG","type":"nonprofit","lineage":["https://openalex.org/I4210122440"]},{"id":"https://openalex.org/I95023434","display_name":"University of KwaZulu-Natal","ror":"https://ror.org/04qzfn040","country_code":"ZA","type":"education","lineage":["https://openalex.org/I95023434"]}],"countries":["NG","ZA"],"is_corresponding":false,"raw_author_name":"Henri Tonnang","raw_affiliation_strings":["Department of Environmental Science, University of KwaZulu Natal, Durban, South Africa","International Institute of Tropical Agriculture (IITA), Ibadan, Nigeria"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Environmental Science, University of KwaZulu Natal, Durban, South Africa","institution_ids":["https://openalex.org/I95023434"]},{"raw_affiliation_string":"International Institute of Tropical Agriculture (IITA), Ibadan, Nigeria","institution_ids":["https://openalex.org/I4210122440"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010376761","display_name":"Kennedy Senagi","orcid":"https://orcid.org/0000-0002-0757-3907"},"institutions":[{"id":"https://openalex.org/I168685008","display_name":"International Centre of Insect Physiology and Ecology","ror":"https://ror.org/03qegss47","country_code":"KE","type":"facility","lineage":["https://openalex.org/I168685008"]}],"countries":["KE"],"is_corresponding":false,"raw_author_name":"Kennedy Senagi","raw_affiliation_strings":["Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya","institution_ids":["https://openalex.org/I168685008"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026620212","display_name":"Mark Wamalwa","orcid":null},"institutions":[{"id":"https://openalex.org/I168685008","display_name":"International Centre of Insect Physiology and Ecology","ror":"https://ror.org/03qegss47","country_code":"KE","type":"facility","lineage":["https://openalex.org/I168685008"]}],"countries":["KE"],"is_corresponding":false,"raw_author_name":"Mark Wamalwa","raw_affiliation_strings":["Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya","institution_ids":["https://openalex.org/I168685008"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040849273","display_name":"Komi Agboka","orcid":"https://orcid.org/0000-0002-8287-6203"},"institutions":[{"id":"https://openalex.org/I168685008","display_name":"International Centre of Insect Physiology and Ecology","ror":"https://ror.org/03qegss47","country_code":"KE","type":"facility","lineage":["https://openalex.org/I168685008"]}],"countries":["KE"],"is_corresponding":false,"raw_author_name":"Komi M. Agboka","raw_affiliation_strings":["Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Data Management and Geo-Information Unit (DMMGU), International Centre of Insect Physiology and Ecology, Nairobi, Kenya","institution_ids":["https://openalex.org/I168685008"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5031257536","display_name":"John Odindi","orcid":"https://orcid.org/0000-0002-4934-1346"},"institutions":[{"id":"https://openalex.org/I95023434","display_name":"University of KwaZulu-Natal","ror":"https://ror.org/04qzfn040","country_code":"ZA","type":"education","lineage":["https://openalex.org/I95023434"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"John Odindi","raw_affiliation_strings":["Department of Environmental Science, University of KwaZulu Natal, Durban, South Africa"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Environmental Science, University of KwaZulu Natal, Durban, South Africa","institution_ids":["https://openalex.org/I95023434"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5119550483"],"corresponding_institution_ids":["https://openalex.org/I168685008","https://openalex.org/I95023434"],"apc_list":{"value":1150,"currency":"USD","value_usd":1150},"apc_paid":{"value":1150,"currency":"USD","value_usd":1150},"fwci":0.6255,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.71452693,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"8","issue":null,"first_page":"1466092","last_page":"1466092"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.25110000371932983,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.25110000371932983,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.11680000275373459,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.09570000320672989,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6436511278152466},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5913257598876953},{"id":"https://openalex.org/keywords/information-storage","display_name":"Information storage","score":0.5286175608634949},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.5091966986656189},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.46589338779449463},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.435380756855011},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.10184299945831299},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.0922381579875946}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6436511278152466},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5913257598876953},{"id":"https://openalex.org/C2988424471","wikidata":"https://www.wikidata.org/wiki/Q193395","display_name":"Information storage","level":2,"score":0.5286175608634949},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.5091966986656189},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.46589338779449463},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.435380756855011},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.10184299945831299},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0922381579875946}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.3389/frai.2025.1466092","is_oa":true,"landing_page_url":"https://doi.org/10.3389/frai.2025.1466092","pdf_url":"https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2025.1466092/pdf","source":{"id":"https://openalex.org/S4210197006","display_name":"Frontiers in Artificial Intelligence","issn_l":"2624-8212","issn":["2624-8212"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320527","host_organization_name":"Frontiers Media","host_organization_lineage":["https://openalex.org/P4310320527"],"host_organization_lineage_names":["Frontiers Media"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence","raw_type":"journal-article"},{"id":"pmid:40979437","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40979437","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in artificial intelligence","raw_type":null},{"id":"pmh:oai:doaj.org/article:d4c75b5b31c64049975e0386f5b43bd4","is_oa":true,"landing_page_url":"https://doaj.org/article/d4c75b5b31c64049975e0386f5b43bd4","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Frontiers in Artificial Intelligence, Vol 8 (2025)","raw_type":"article"},{"id":"pmh:oai:europepmc.org:11256754","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/12447192","pdf_url":null,"source":{"id":"https://openalex.org/S4306400806","display_name":"Europe PMC (PubMed Central)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1303153112","host_organization_name":"European Bioinformatics Institute","host_organization_lineage":["https://openalex.org/I1303153112"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Text"},{"id":"pmh:oai:pubmedcentral.nih.gov:12447192","is_oa":true,"landing_page_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12447192/","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Front Artif Intell","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.3389/frai.2025.1466092","is_oa":true,"landing_page_url":"https://doi.org/10.3389/frai.2025.1466092","pdf_url":"https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2025.1466092/pdf","source":{"id":"https://openalex.org/S4210197006","display_name":"Frontiers in Artificial Intelligence","issn_l":"2624-8212","issn":["2624-8212"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320527","host_organization_name":"Frontiers Media","host_organization_lineage":["https://openalex.org/P4310320527"],"host_organization_lineage_names":["Frontiers Media"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4414019658.pdf","grobid_xml":"https://content.openalex.org/works/W4414019658.grobid-xml"},"referenced_works_count":54,"referenced_works":["https://openalex.org/W633457721","https://openalex.org/W1907286193","https://openalex.org/W2012932483","https://openalex.org/W2462565033","https://openalex.org/W2612690371","https://openalex.org/W2763728040","https://openalex.org/W2803853169","https://openalex.org/W2896457183","https://openalex.org/W2941078097","https://openalex.org/W2948269756","https://openalex.org/W2962685863","https://openalex.org/W2970819616","https://openalex.org/W2983562253","https://openalex.org/W3003261556","https://openalex.org/W3022903699","https://openalex.org/W3043040641","https://openalex.org/W3049596049","https://openalex.org/W3123829608","https://openalex.org/W3127365350","https://openalex.org/W3137351210","https://openalex.org/W3144293453","https://openalex.org/W3185842191","https://openalex.org/W3201170304","https://openalex.org/W3201869313","https://openalex.org/W3214342214","https://openalex.org/W3217518891","https://openalex.org/W4206025858","https://openalex.org/W4210956777","https://openalex.org/W4221047533","https://openalex.org/W4282984451","https://openalex.org/W4283120767","https://openalex.org/W4287854476","https://openalex.org/W4296910121","https://openalex.org/W4301398567","https://openalex.org/W4312457904","https://openalex.org/W4319730829","https://openalex.org/W4320015776","https://openalex.org/W4320728842","https://openalex.org/W4322718191","https://openalex.org/W4327928485","https://openalex.org/W4384699688","https://openalex.org/W4386836900","https://openalex.org/W4386896980","https://openalex.org/W4389109142","https://openalex.org/W4389977189","https://openalex.org/W4390041933","https://openalex.org/W4390413780","https://openalex.org/W4392658332","https://openalex.org/W4396919411","https://openalex.org/W4399117498","https://openalex.org/W4403422608","https://openalex.org/W4404649228","https://openalex.org/W4404783839","https://openalex.org/W4405754357"],"related_works":["https://openalex.org/W2377297411","https://openalex.org/W3148217948","https://openalex.org/W2375788636","https://openalex.org/W2358561207","https://openalex.org/W2975617233","https://openalex.org/W2388704129","https://openalex.org/W2392827053","https://openalex.org/W2377877252","https://openalex.org/W2362914816","https://openalex.org/W2372644337"],"abstract_inverted_index":{"Introduction:":[0],"Automating":[1],"the":[2,48,53,96,101,157],"extraction":[3,72,165],"of":[4,47,98,103,162],"information":[5,17,71,141,164],"from":[6],"Portable":[7],"Document":[8],"Format":[9],"(PDF)":[10],"documents":[11],"represents":[12],"a":[13,124],"major":[14],"advancement":[15],"in":[16,21,69,115],"extraction,":[18],"with":[19],"applications":[20],"various":[22],"domains":[23],"such":[24,112],"as":[25,113],"healthcare,":[26],"law,":[27],"or":[28],"biochemistry.":[29],"However,":[30],"existing":[31],"solutions":[32],"face":[33],"challenges":[34],"related":[35],"to":[36,64,155],"accuracy,":[37,158],"domain":[38],"adaptability,":[39,159],"and":[40,60,67,73,89,110,149,160],"implementation":[41],"complexity.":[42],"Methods:":[43],"A":[44],"systematic":[45],"review":[46,78],"literature":[49],"was":[50],"conducted":[51],"using":[52],"Preferred":[54],"Reporting":[55],"Items":[56],"for":[57,107],"Systematic":[58],"Reviews":[59],"Meta-Analyses":[61],"(PRISMA)":[62],"methodology":[63],"examine":[65],"approaches":[66],"trends":[68],"PDF":[70,163],"storage":[74],"approaches.":[75,92],"Results:":[76],"The":[77],"revealed":[79],"three":[80],"dominant":[81],"methodological":[82],"categories:":[83],"rule-based":[84,99],"systems,":[85],"statistical":[86],"learning":[87],"models,":[88],"neural":[90],"network-based":[91],"Key":[93],"limitations":[94],"include":[95],"rigidity":[97],"methods,":[100],"lack":[102],"annotated":[104],"domain-specific":[105],"datasets":[106],"learning-based":[108],"approaches,":[109],"issues":[111],"hallucinations":[114],"large":[116],"language":[117],"models.":[118],"Discussion:":[119],"To":[120],"overcome":[121],"these":[122],"limitations,":[123],"conceptual":[125],"framework":[126,153],"is":[127],"proposed":[128],"comprising":[129],"nine":[130],"core":[131],"components:":[132],"project":[133],"manager,":[134,136,140],"document":[135,137],"pre-processor,":[138],"ontology":[139],"extractor,":[142],"annotation":[143],"engine,":[144],"question-answering":[145],"tool,":[146],"knowledge":[147],"visualizer,":[148],"data":[150],"exporter.":[151],"This":[152],"aims":[154],"improve":[156],"usability":[161],"systems.":[166]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
