{"id":"https://openalex.org/W7137827436","doi":"https://doi.org/10.48550/arxiv.2603.13238","title":"KazakhOCR: A Synthetic Benchmark for Evaluating Multimodal Models in Low-Resource Kazakh Script OCR","display_name":"KazakhOCR: A Synthetic Benchmark for Evaluating Multimodal Models in Low-Resource Kazakh Script OCR","publication_year":2026,"publication_date":"2026-02-17","ids":{"openalex":"https://openalex.org/W7137827436","doi":"https://doi.org/10.48550/arxiv.2603.13238"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13238","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13238","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13238","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119386550","display_name":"Henry Gagnier","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gagnier, Henry","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129721839","display_name":"Sophie Gagnier","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gagnier, Sophie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129713769","display_name":"Ashwin Kirubakaran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kirubakaran, Ashwin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5119386550"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0034000000450760126,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.7580999732017517},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.7182000279426575},{"id":"https://openalex.org/keywords/kazakh","display_name":"Kazakh","score":0.7009999752044678},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.4900999963283539},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.48969998955726624},{"id":"https://openalex.org/keywords/arabic","display_name":"Arabic","score":0.4334999918937683},{"id":"https://openalex.org/keywords/romanization","display_name":"Romanization","score":0.4023999869823456},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.37880000472068787},{"id":"https://openalex.org/keywords/arabic-script","display_name":"Arabic script","score":0.3361999988555908}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8087000250816345},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.7580999732017517},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.73089998960495},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.7182000279426575},{"id":"https://openalex.org/C2781297163","wikidata":"https://www.wikidata.org/wiki/Q9252","display_name":"Kazakh","level":2,"score":0.7009999752044678},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6581000089645386},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.4900999963283539},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.48969998955726624},{"id":"https://openalex.org/C96455323","wikidata":"https://www.wikidata.org/wiki/Q13955","display_name":"Arabic","level":2,"score":0.4334999918937683},{"id":"https://openalex.org/C106930687","wikidata":"https://www.wikidata.org/wiki/Q976327","display_name":"Romanization","level":2,"score":0.4023999869823456},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3901999890804291},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.37880000472068787},{"id":"https://openalex.org/C2777323237","wikidata":"https://www.wikidata.org/wiki/Q1828555","display_name":"Arabic script","level":3,"score":0.3361999988555908},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3325999975204468},{"id":"https://openalex.org/C32717103","wikidata":"https://www.wikidata.org/wiki/Q184759","display_name":"Character encoding","level":3,"score":0.3034000098705292},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.302700012922287},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3025999963283539},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2989000082015991},{"id":"https://openalex.org/C2781095461","wikidata":"https://www.wikidata.org/wiki/Q42222","display_name":"Pinyin","level":3,"score":0.29829999804496765},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.2922999858856201},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28929999470710754},{"id":"https://openalex.org/C500551929","wikidata":"https://www.wikidata.org/wiki/Q8819","display_name":"Unicode","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C2987247673","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Character recognition","level":3,"score":0.26589998602867126},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13238","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13238","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13238","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13238","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7060372829437256,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Kazakh":[0,27,110],"is":[1,29],"a":[2,47,78,124],"Turkic":[3],"language":[4,74,86],"using":[5],"the":[6,40,81,106,161],"Arabic,":[7,115],"Cyrillic,":[8],"and":[9,32,42,61,85,90,98,102,117,128,159,166,171],"Latin":[10,43,97],"scripts,":[11],"making":[12],"it":[13,113],"unique":[14],"in":[15,150],"terms":[16],"of":[17,51,80],"optical":[18],"character":[19,136],"recognition":[20],"(OCR).":[21],"Work":[22],"on":[23,77],"OCR":[24,34,49,67,84,126,133],"for":[25,39,54,83,163],"low-resource":[26,156,169],"scripts":[28,57,158,170],"very":[30],"scarce,":[31],"no":[33],"benchmarks":[35,167],"or":[36],"images":[37,53],"exist":[38],"Arabic":[41,99,107],"scripts.":[44],"We":[45,69,119],"construct":[46],"synthetic":[48],"dataset":[50],"7,219":[52],"all":[55],"three":[56,71],"with":[58,96,123],"font,":[59],"color,":[60],"noise":[62],"variations":[63],"to":[64,104,141,154],"imitate":[65],"real":[66],"tasks.":[68],"evaluated":[70],"multimodal":[72],"large":[73],"models":[75,93,165],"(MLLMs)":[76],"subset":[79],"benchmark":[82],"identification:":[87],"Gemma-3-12B-it,":[88],"Qwen2.5-VL-7B-Instruct,":[89],"Llama-3.2-11B-Vision-Instruct.":[91],"All":[92],"are":[94],"unsuccessful":[95],"script":[100,108],"OCR,":[101],"fail":[103,140],"recognize":[105],"as":[109,114],"text,":[111],"misclassifying":[112],"Farsi,":[116],"Kurdish.":[118],"further":[120],"compare":[121],"MLLMs":[122,139],"classical":[125],"baseline":[127],"find":[129],"that":[130],"while":[131],"traditional":[132],"has":[134],"lower":[135],"error":[137],"rates,":[138],"match":[142],"this":[143],"performance.":[144],"These":[145],"findings":[146],"show":[147],"significant":[148],"gaps":[149],"current":[151],"MLLM":[152],"capabilities":[153],"process":[155],"Abjad-based":[157],"demonstrate":[160],"need":[162],"inclusive":[164],"supporting":[168],"languages.":[172]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
