{"id":"https://openalex.org/W4416035265","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.1696","title":"KRETA: A Benchmark for Korean Reading and Reasoning in Text-Rich VQA Attuned to Diverse Visual Contexts","display_name":"KRETA: A Benchmark for Korean Reading and Reasoning in Text-Rich VQA Attuned to Diverse Visual Contexts","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416035265","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.1696"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.1696","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.1696","pdf_url":"https://aclanthology.org/2025.emnlp-main.1696.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.1696.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078799563","display_name":"Taebaek Hwang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Taebaek Hwang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100671473","display_name":"Minseo Kim","orcid":"https://orcid.org/0000-0002-7522-184X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Minseo Kim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026592022","display_name":"Gisang Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gisang Lee","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052678547","display_name":"S.H. Kim","orcid":"https://orcid.org/0009-0008-1275-1756"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seonuk Kim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066086175","display_name":"Hyunjun Eun","orcid":"https://orcid.org/0000-0001-7794-5377"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hyunjun Eun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5078799563"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18287683,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"33409","last_page":"33420"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.34290000796318054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.34290000796318054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.2635999917984009,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.06589999794960022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.5468999743461609},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4699999988079071},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.3278999924659729},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.27869999408721924}],"concepts":[{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.5468999743461609},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5314000248908997},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4699999988079071},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45680001378059387},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.447299987077713},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4251999855041504},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3732999861240387},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3278999924659729},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.1696","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.1696","pdf_url":"https://aclanthology.org/2025.emnlp-main.1696.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.1696","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.1696","pdf_url":"https://aclanthology.org/2025.emnlp-main.1696.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416035265.pdf","grobid_xml":"https://content.openalex.org/works/W4416035265.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Understanding":[0],"and":[1,19,33,62,75,94,106,127,146,165,231,241,264,274,305,317,326,356,401,432],"reasoning":[2,95,215,233,423],"over":[3,216],"text":[4,92,201,256],"within":[5,219],"visual":[6,83,91,193,240,364],"contexts":[7],"poses":[8],"a":[9,42,70,100,112,128,351,375,421],"significant":[10],"challenge":[11],"for":[12,37,46,72,119,140,167,254,307,344,353],"Vision-Language":[13,174],"Models":[14,175],"(VLMs),":[15],"given":[16],"the":[17,53,151,195,340,392,427],"complexity":[18],"diversity":[20],"of":[21,55,89,153,378,429],"real-world":[22,379],"scenarios.To":[23],"address":[24,295],"this":[25,65,296],"challenge,":[26],"text-rich":[27,120,346],"Visual":[28],"Question":[29],"Answering":[30],"(VQA)":[31],"datasets":[32,312],"benchmarks":[34,57,155,205,284],"have":[35,292],"emerged":[36],"high-resource":[38],"languages":[39,48,247],"like":[40],"English.However,":[41],"critical":[43],"gap":[44,343],"persists":[45],"low-resource":[47,246],"such":[49],"as":[50],"Korean,":[51,141],"where":[52],"lack":[54,250],"comprehensive":[56,262],"hinders":[58],"robust":[59],"model":[60,266],"evaluation":[61,88,131,263,342],"comparison.To":[63],"bridge":[64],"gap,":[66],"we":[67,110,142,348,383,418],"introduce":[68,111],"KRETA,":[69,350],"benchmark":[71,251,352],"Korean":[73,249,310,345,354,380,393],"Reading":[74,355],"rEasoning":[76,357],"in":[77,156,173,244,332,358,407],"Text-rich":[78,359],"VQA":[79,114,283,311,360],"Attuned":[80,361],"to":[81,133,189,294,301,362,391],"diverse":[82,192,269,363],"contexts.KRETA":[84],"facilitates":[85],"an":[86],"in-depth":[87],"both":[90],"understanding":[93],"capabilities,":[96],"while":[97],"also":[98],"supporting":[99],"multifaceted":[101],"assessment":[102],"across":[103,268],"15":[104,387],"domains":[105,270,388],"26":[107,402],"image":[108,125,275,403],"types.Additionally,":[109],"semiautomated":[113],"generation":[115],"pipeline":[116,148],"specifically":[117],"optimized":[118],"settings,":[121],"leveraging":[122,228],"refined":[123],"stepwise":[124],"decomposition":[126],"rigorous":[129],"seven-metric":[130],"protocol":[132],"ensure":[134],"data":[135],"quality.While":[136],"KRETA":[137,168,371],"is":[138,372],"tailored":[139],"hope":[143],"our":[144],"adaptable":[145],"extensible":[147],"will":[149],"facilitate":[150],"development":[152,267],"similar":[154],"other":[157],"languages,":[158],"thereby":[159],"accelerating":[160],"multilingual":[161,282],"VLM":[162],"research.The":[163],"code":[164],"dataset":[166],"are":[169,330],"available":[170],"at":[171],"https://github.com/tabtoyou/KRETA.advances":[172],"(VLMs)":[176],"(Liu":[177],"et":[178,181,185,207,210,286,289,314,411,415],"al.,":[179,182,186,208,211,287,290,315,412,416],"2023a;Wang":[180],"2024;":[183],"Zhang":[184],"2024b)":[187],"designed":[188],"handle":[190],"these":[191,221],"contexts.Recently,":[194],"field":[196],"has":[197],"progressed":[198],"beyond":[199],"basic":[200,255],"recognition,":[202,257],"with":[203],"new":[204],"(Yue":[206,410],"2024c;Hao":[209],"2025)":[212,319],"emphasizing":[213],"higher-order":[214],"textual":[217],"content":[218],"images.Addressing":[220],"challenges":[222],"necessitates":[223],"tightly":[224],"integrated":[225],"cross-modal":[226],"understanding,":[227],"domain":[229],"knowledge":[230],"multi-step":[232],"that":[234],"cannot":[235],"be":[236],"achieved":[237],"by":[238,389,426],"treating":[239],"linguistic":[242],"elements":[243],"isolation.However,":[245],"including":[248],"suites":[252],"even":[253],"much":[258],"less":[259],"reasoning,":[260],"impeding":[261],"hindering":[265],"(e.g.,":[271,277,334],"commerce,":[272],"education)":[273],"types":[276,404],"street":[278],"signs,":[279],"charts).Although":[280],"recent":[281],"(Tang":[285],"2024b;Sun":[288],"2024)":[291,400],"begun":[293],"disparity,":[297],"they":[298],"often":[299,320],"struggle":[300],"provide":[302],"sufficient":[303],"coverage":[304],"depth":[306],"all":[308],"languages.Existing":[309],"(Ju":[313],"2024;Kim":[316],"Jung,":[318],"rely":[321],"on":[322],"translated":[323],"English":[324],"questions":[325],"non-Korean":[327],"images,":[328],"or":[329],"limited":[331],"scale":[333],"fewer":[335],"than":[336],"650":[337],"samples).To":[338],"fill":[339],"underexplored":[341],"VQA,":[347],"propose":[349],"contexts.Specifically,":[365],"Figure":[366],"1":[367,431],"(a)":[368],"shows":[369],"how":[370],"built":[373],"upon":[374],"wide":[376],"range":[377],"imagery,":[381],"which":[382],"systematically":[384],"categorized":[385],"into":[386],"referring":[390],"Standard":[394],"Industrial":[395],"Classification":[396],"(KSIC)":[397],"(Statistics":[398],"Korea,":[399],"widely":[405],"used":[406],"prior":[408],"works":[409],"2024a;":[413],"Tang":[414],"2024b).Furthermore,":[417],"carefully":[419],"design":[420],"dual-level":[422],"framework":[424],"inspired":[425],"concepts":[428],"System":[430,433]},"counts_by_year":[],"updated_date":"2026-03-12T06:13:28.667946","created_date":"2025-11-08T00:00:00"}
