{"id":"https://openalex.org/W4416016930","doi":"https://doi.org/10.1145/3746252.3761295","title":"Evaluating Robustness of LLMs in Question Answering on Multilingual Noisy OCR Data","display_name":"Evaluating Robustness of LLMs in Question Answering on Multilingual Noisy OCR Data","publication_year":2025,"publication_date":"2025-11-07","ids":{"openalex":"https://openalex.org/W4416016930","doi":"https://doi.org/10.1145/3746252.3761295"},"language":"en","primary_location":{"id":"doi:10.1145/3746252.3761295","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761295","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746252.3761295","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5007529430","display_name":"Bhawna Piryani","orcid":"https://orcid.org/0009-0005-3578-2393"},"institutions":[{"id":"https://openalex.org/I190249584","display_name":"Universit\u00e4t Innsbruck","ror":"https://ror.org/054pv6659","country_code":"AT","type":"education","lineage":["https://openalex.org/I190249584"]}],"countries":["AT"],"is_corresponding":true,"raw_author_name":"Bhawna Piryani","raw_affiliation_strings":["University of Innsbruck, Innsbruck, Austria"],"affiliations":[{"raw_affiliation_string":"University of Innsbruck, Innsbruck, Austria","institution_ids":["https://openalex.org/I190249584"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001145944","display_name":"Jamshid Mozafari","orcid":"https://orcid.org/0000-0003-4850-9239"},"institutions":[{"id":"https://openalex.org/I190249584","display_name":"Universit\u00e4t Innsbruck","ror":"https://ror.org/054pv6659","country_code":"AT","type":"education","lineage":["https://openalex.org/I190249584"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Jamshid Mozafari","raw_affiliation_strings":["University of Innsbruck, Innsbruck, Austria"],"affiliations":[{"raw_affiliation_string":"University of Innsbruck, Innsbruck, Austria","institution_ids":["https://openalex.org/I190249584"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064852810","display_name":"Abdelrahman Abdallah","orcid":"https://orcid.org/0000-0001-8747-4927"},"institutions":[{"id":"https://openalex.org/I190249584","display_name":"Universit\u00e4t Innsbruck","ror":"https://ror.org/054pv6659","country_code":"AT","type":"education","lineage":["https://openalex.org/I190249584"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Abdelrahman Abdallah","raw_affiliation_strings":["University of Innsbruck, Innsbruck, Austria"],"affiliations":[{"raw_affiliation_string":"University of Innsbruck, Innsbruck, Austria","institution_ids":["https://openalex.org/I190249584"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033491986","display_name":"Antoine Doucet","orcid":"https://orcid.org/0000-0001-6160-3356"},"institutions":[{"id":"https://openalex.org/I78744979","display_name":"La Rochelle Universit\u00e9","ror":"https://ror.org/04mv1z119","country_code":"FR","type":"education","lineage":["https://openalex.org/I78744979"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Antoine Doucet","raw_affiliation_strings":["University of La Rochelle, La Rochelle, France and University of Ljubljana, Ljubljana, Slovenia"],"affiliations":[{"raw_affiliation_string":"University of La Rochelle, La Rochelle, France and University of Ljubljana, Ljubljana, Slovenia","institution_ids":["https://openalex.org/I78744979"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079733597","display_name":"Adam Jatowt","orcid":"https://orcid.org/0000-0001-7235-0665"},"institutions":[{"id":"https://openalex.org/I190249584","display_name":"Universit\u00e4t Innsbruck","ror":"https://ror.org/054pv6659","country_code":"AT","type":"education","lineage":["https://openalex.org/I190249584"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Adam Jatowt","raw_affiliation_strings":["University of Innsbruck, Innsbruck, Austria"],"affiliations":[{"raw_affiliation_string":"University of Innsbruck, Innsbruck, Austria","institution_ids":["https://openalex.org/I190249584"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5007529430"],"corresponding_institution_ids":["https://openalex.org/I190249584"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33066352,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2366","last_page":"2376"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.550000011920929,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.550000011920929,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.30559998750686646,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.05310000106692314,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7372000217437744},{"id":"https://openalex.org/keywords/digitization","display_name":"Digitization","score":0.707099974155426},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.6840000152587891},{"id":"https://openalex.org/keywords/imperfect","display_name":"Imperfect","score":0.5382999777793884},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.4893999993801117},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4122999906539917},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4113999903202057},{"id":"https://openalex.org/keywords/named-entity-recognition","display_name":"Named-entity recognition","score":0.3813000023365021}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8256000280380249},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7372000217437744},{"id":"https://openalex.org/C2779308522","wikidata":"https://www.wikidata.org/wiki/Q843958","display_name":"Digitization","level":2,"score":0.707099974155426},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.6840000152587891},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6416000127792358},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5823000073432922},{"id":"https://openalex.org/C2780310539","wikidata":"https://www.wikidata.org/wiki/Q12547192","display_name":"Imperfect","level":2,"score":0.5382999777793884},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.4893999993801117},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4122999906539917},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4113999903202057},{"id":"https://openalex.org/C2779135771","wikidata":"https://www.wikidata.org/wiki/Q403574","display_name":"Named-entity recognition","level":3,"score":0.3813000023365021},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.37929999828338623},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3564000129699707},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C3018824978","wikidata":"https://www.wikidata.org/wiki/Q2894891","display_name":"Error analysis","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.3418999910354614},{"id":"https://openalex.org/C2781170535","wikidata":"https://www.wikidata.org/wiki/Q30587856","display_name":"Noisy data","level":2,"score":0.3379000127315521},{"id":"https://openalex.org/C67905146","wikidata":"https://www.wikidata.org/wiki/Q5287646","display_name":"Document processing","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.30149999260902405},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2935999929904938},{"id":"https://openalex.org/C2987247673","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Character recognition","level":3,"score":0.2930999994277954},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.25940001010894775}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3746252.3761295","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761295","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},{"id":"pmh:oai:d.cobiss.net/repository/si:272786691","is_oa":true,"landing_page_url":"https://plus.cobiss.net/cobiss/si/en/bib/272786691","pdf_url":null,"source":{"id":"https://openalex.org/S7407055177","display_name":"dCOBISS.SI Digital Repository","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"COBISS-ID: 272764675","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:repozitorij.uni-lj.si:IzpisGradiva.php?id=181096","is_oa":true,"landing_page_url":"https://repozitorij.uni-lj.si/IzpisGradiva.php?id=181096","pdf_url":"https://repozitorij.uni-lj.si/Dokument.php?lang=slv&id=231270&dn=","source":{"id":"https://openalex.org/S4377196268","display_name":"Repository of the University of Ljubljana (University of Ljubljana)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I153976015","host_organization_name":"University of Ljubljana","host_organization_lineage":["https://openalex.org/I153976015"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"ISBN: 979-8-4007-2040-6","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1145/3746252.3761295","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761295","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1573345323","https://openalex.org/W2212003014","https://openalex.org/W2786672397","https://openalex.org/W2805411799","https://openalex.org/W2891810409","https://openalex.org/W2900753330","https://openalex.org/W2963748441","https://openalex.org/W2986829274","https://openalex.org/W3035022648","https://openalex.org/W3049158684","https://openalex.org/W3084721699","https://openalex.org/W3099756172","https://openalex.org/W3158086504","https://openalex.org/W3199662997","https://openalex.org/W3215413293","https://openalex.org/W4221050583","https://openalex.org/W4230543443","https://openalex.org/W4318196853","https://openalex.org/W4391331168","https://openalex.org/W4393248081","https://openalex.org/W4409362017"],"related_works":[],"abstract_inverted_index":{"Optical":[0],"Character":[1],"Recognition":[2],"(OCR)":[3],"plays":[4],"a":[5,41,61],"crucial":[6],"role":[7],"in":[8,162],"digitizing":[9],"historical":[10,83,163],"and":[11,26,75,89,128,153],"multilingual":[12,62],"documents,":[13,84],"yet":[14],"OCR":[15,92,113,133],"errors":[16,127],"-":[17],"imperfect":[18],"extraction":[19],"of":[20,44,51,91,150],"text,":[21],"including":[22],"character":[23],"insertion,":[24],"deletion,":[25],"substitution":[27],"can":[28],"significantly":[29],"impact":[30],"downstream":[31],"tasks":[32],"like":[33],"question-answering":[34],"(QA).":[35],"In":[36],"this":[37,57],"work,":[38],"we":[39,59,144],"conduct":[40],"comprehensive":[42],"analysis":[43],"how":[45,97],"OCR-induced":[46,126],"noise":[47],"affects":[48],"the":[49,148,155],"performance":[50,138],"Multilingual":[52],"QA":[53,63,120,160],"Systems.":[54],"To":[55],"support":[56],"analysis,":[58],"introduce":[60],"dataset":[64,78],"MultiOCR-QA,":[65],"comprising":[66],"50K":[67],"question-answer":[68],"pairs":[69],"across":[70],"three":[71,111],"languages,":[72],"English,":[73],"French,":[74],"German.":[76],"The":[77],"is":[79],"curated":[80],"from":[81],"OCR-ed":[82],"which":[85],"include":[86],"different":[87,98,106],"levels":[88],"types":[90],"noise.":[93],"We":[94],"then":[95],"evaluate":[96],"state-of-the-art":[99],"Large":[100],"Language":[101],"Models":[102],"(LLMs)":[103],"perform":[104,129],"under":[105],"error":[107,114],"conditions,":[108],"focusing":[109],"on":[110,131,139],"major":[112],"types.":[115],"Our":[116],"findings":[117],"show":[118],"that":[119],"systems":[121,161],"are":[122],"highly":[123],"prone":[124],"to":[125],"poorly":[130],"noisy":[132,142],"text.":[134],"By":[135],"comparing":[136],"model":[137],"clean":[140],"versus":[141],"texts,":[143],"provide":[145],"insights":[146],"into":[147],"limitations":[149],"current":[151],"approaches":[152],"emphasize":[154],"need":[156],"for":[157],"more":[158],"noise-resilient":[159],"digitization":[164],"contexts.":[165]},"counts_by_year":[],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-11-08T00:00:00"}
