{"id":"https://openalex.org/W4401589459","doi":"https://doi.org/10.1007/978-3-031-65794-8_4","title":"OCR Cleaning of\u00a0Scientific Texts with\u00a0LLMs","display_name":"OCR Cleaning of\u00a0Scientific Texts with\u00a0LLMs","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4401589459","doi":"https://doi.org/10.1007/978-3-031-65794-8_4"},"language":"en","primary_location":{"id":"doi:10.1007/978-3-031-65794-8_4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/978-3-031-65794-8_4","pdf_url":"https://link.springer.com/content/pdf/10.1007/978-3-031-65794-8_4.pdf","source":{"id":"https://openalex.org/S106296714","display_name":"Lecture notes in computer science","issn_l":"0302-9743","issn":["0302-9743","1611-3349"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"book series"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Lecture Notes in Computer Science","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/978-3-031-65794-8_4.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106495150","display_name":"G\u00e1bor Madar\u00e1sz","orcid":null},"institutions":[{"id":"https://openalex.org/I2802350943","display_name":"ELTE Hungarian Research Centre for Linguistics","ror":"https://ror.org/005cqsz63","country_code":"HU","type":"facility","lineage":["https://openalex.org/I2802350943"]}],"countries":["HU"],"is_corresponding":true,"raw_author_name":"G\u00e1bor Madar\u00e1sz","raw_affiliation_strings":["HUN-REN Hungarian Research Centre for Linguistics, Budapest, Hungary"],"affiliations":[{"raw_affiliation_string":"HUN-REN Hungarian Research Centre for Linguistics, Budapest, Hungary","institution_ids":["https://openalex.org/I2802350943"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005415440","display_name":"No\u00e9mi Ligeti-Nagy","orcid":"https://orcid.org/0000-0003-0851-7621"},"institutions":[{"id":"https://openalex.org/I2802350943","display_name":"ELTE Hungarian Research Centre for Linguistics","ror":"https://ror.org/005cqsz63","country_code":"HU","type":"facility","lineage":["https://openalex.org/I2802350943"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"No\u00e9mi Ligeti-Nagy","raw_affiliation_strings":["HUN-REN Hungarian Research Centre for Linguistics, Budapest, Hungary"],"affiliations":[{"raw_affiliation_string":"HUN-REN Hungarian Research Centre for Linguistics, Budapest, Hungary","institution_ids":["https://openalex.org/I2802350943"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037594849","display_name":"Andr\u00e1s Holl","orcid":"https://orcid.org/0000-0002-6873-3425"},"institutions":[{"id":"https://openalex.org/I4210140733","display_name":"Library and Information Centre of the Hungarian Academy of Sciences","ror":"https://ror.org/04ws47v52","country_code":"HU","type":"archive","lineage":["https://openalex.org/I4210140733","https://openalex.org/I7597260"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Andr\u00e1s Holl","raw_affiliation_strings":["Library and Information Centre, Hungarian Academy of Sciences, Budapest, Hungary"],"affiliations":[{"raw_affiliation_string":"Library and Information Centre, Hungarian Academy of Sciences, Budapest, Hungary","institution_ids":["https://openalex.org/I4210140733"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5106519677","display_name":"Tam\u00e1s V\u00e1radi","orcid":"https://orcid.org/0000-0001-5765-3908"},"institutions":[{"id":"https://openalex.org/I2802350943","display_name":"ELTE Hungarian Research Centre for Linguistics","ror":"https://ror.org/005cqsz63","country_code":"HU","type":"facility","lineage":["https://openalex.org/I2802350943"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Tam\u00e1s V\u00e1radi","raw_affiliation_strings":["HUN-REN Hungarian Research Centre for Linguistics, Budapest, Hungary"],"affiliations":[{"raw_affiliation_string":"HUN-REN Hungarian Research Centre for Linguistics, Budapest, Hungary","institution_ids":["https://openalex.org/I2802350943"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5106495150"],"corresponding_institution_ids":["https://openalex.org/I2802350943"],"apc_list":{"value":5000,"currency":"EUR","value_usd":5392},"apc_paid":{"value":5000,"currency":"EUR","value_usd":5392},"fwci":0.7794,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.71619146,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"49","last_page":"58"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.9891999959945679,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9243000149726868,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7194685339927673}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7194685339927673}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/978-3-031-65794-8_4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/978-3-031-65794-8_4","pdf_url":"https://link.springer.com/content/pdf/10.1007/978-3-031-65794-8_4.pdf","source":{"id":"https://openalex.org/S106296714","display_name":"Lecture notes in computer science","issn_l":"0302-9743","issn":["0302-9743","1611-3349"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"book series"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Lecture Notes in Computer Science","raw_type":"book-chapter"},{"id":"pmh:oai:real.mtak.hu:202543","is_oa":true,"landing_page_url":null,"pdf_url":"https://real.mtak.hu/202543/1/Madarasz_et_al_OCR_Cleaning_with_LLMs.pdf","source":{"id":"https://openalex.org/S4306400081","display_name":"Repository of the Academy's Library (Library of the Hungarian Academy of Sciences)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210140733","host_organization_name":"Library and Information Centre of the Hungarian Academy of Sciences","host_organization_lineage":["https://openalex.org/I4210140733"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference or Workshop Item"}],"best_oa_location":{"id":"doi:10.1007/978-3-031-65794-8_4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/978-3-031-65794-8_4","pdf_url":"https://link.springer.com/content/pdf/10.1007/978-3-031-65794-8_4.pdf","source":{"id":"https://openalex.org/S106296714","display_name":"Lecture notes in computer science","issn_l":"0302-9743","issn":["0302-9743","1611-3349"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"book series"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Lecture Notes in Computer Science","raw_type":"book-chapter"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.46000000834465027}],"awards":[],"funders":[{"id":"https://openalex.org/F4320322192","display_name":"Magyar Tudom\u00e1nyos Akad\u00e9mia","ror":"https://ror.org/02ks8qq67"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4401589459.pdf","grobid_xml":"https://content.openalex.org/works/W4401589459.grobid-xml"},"referenced_works_count":4,"referenced_works":["https://openalex.org/W2786672397","https://openalex.org/W2981852735","https://openalex.org/W3212092655","https://openalex.org/W3212511129"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Abstract":[0],"Correcting":[1],"Optical":[2],"Character":[3],"Recognition":[4],"(OCR)":[5],"errors":[6,71],"is":[7],"a":[8,45,62],"major":[9],"challenge":[10],"in":[11,115],"preprocessing":[12],"datasets":[13],"consisting":[14,65],"of":[15,55,66],"legacy":[16],"PDF":[17],"files.":[18],"In":[19],"this":[20],"study,":[21],"we":[22],"develop":[23],"Large":[24],"Language":[25],"Models":[26],"specially":[27],"finetuned":[28],"to":[29,89],"correct":[30],"OCR":[31,58,70,90,107],"errors.":[32,59],"We":[33,60],"experimented":[34],"with":[35,57,69,93,109],"the":[36,40,53,82,99],"mT5":[37,83,100],"model":[38,84,101],"(both":[39],"mT5-small":[41],"and":[42],"mT5-large":[43],"configurations),":[44],"Text-to-Text":[46],"Transfer":[47],"Transformer-based":[48],"machine":[49],"translation":[50],"model,":[51],"for":[52,106,111],"post-correction":[54],"texts":[56],"compiled":[61],"parallel":[63],"corpus":[64],"text":[67],"corrupted":[68],"as":[72,74,102],"well":[73],"corresponding":[75],"clean":[76],"data.":[77],"Our":[78],"findings":[79],"suggest":[80],"that":[81],"can":[85],"be":[86],"successfully":[87],"applied":[88],"error":[91],"correction":[92],"improving":[94],"accuracy.":[95],"The":[96],"results":[97],"affirm":[98],"an":[103],"effective":[104],"tool":[105],"post-correction,":[108],"prospects":[110],"achieving":[112],"greater":[113],"efficiency":[114],"future":[116],"research.":[117]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
