{"id":"https://openalex.org/W7147133687","doi":"https://doi.org/10.48550/arxiv.2603.27942","title":"JaWildText: A Benchmark for Vision-Language Models on Japanese Scene Text Understanding","display_name":"JaWildText: A Benchmark for Vision-Language Models on Japanese Scene Text Understanding","publication_year":2026,"publication_date":"2026-03-30","ids":{"openalex":"https://openalex.org/W7147133687","doi":"https://doi.org/10.48550/arxiv.2603.27942"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.27942","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27942","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.27942","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132694529","display_name":"Koki Maeda","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maeda, Koki","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5107642698","display_name":"Naoaki Okazaki","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Okazaki, Naoaki","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5458999872207642,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5458999872207642,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.34869998693466187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.014800000004470348,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/handwriting","display_name":"Handwriting","score":0.7190999984741211},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6043000221252441},{"id":"https://openalex.org/keywords/bigram","display_name":"Bigram","score":0.5250999927520752},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4991999864578247},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.4749999940395355},{"id":"https://openalex.org/keywords/receipt","display_name":"Receipt","score":0.4058000147342682},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.33889999985694885}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7986000180244446},{"id":"https://openalex.org/C2779386606","wikidata":"https://www.wikidata.org/wiki/Q2393642","display_name":"Handwriting","level":2,"score":0.7190999984741211},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6452000141143799},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6043000221252441},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5781000256538391},{"id":"https://openalex.org/C108757681","wikidata":"https://www.wikidata.org/wiki/Q2773912","display_name":"Bigram","level":3,"score":0.5250999927520752},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4991999864578247},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.4749999940395355},{"id":"https://openalex.org/C2778979077","wikidata":"https://www.wikidata.org/wiki/Q330190","display_name":"Receipt","level":2,"score":0.4058000147342682},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3614000082015991},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.33889999985694885},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.30169999599456787},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C112640561","wikidata":"https://www.wikidata.org/wiki/Q2440634","display_name":"Handwriting recognition","level":3,"score":0.2696000039577484},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.267300009727478},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2632000148296356},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.25609999895095825}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.27942","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27942","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.27942","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27942","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.8256085515022278}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Japanese":[0,29,47,78,202],"scene":[1,59,79,203],"text":[2,49,60,80,134,204],"poses":[3],"challenges":[4],"that":[5,108,171],"multilingual":[6,34],"benchmarks":[7],"often":[8],"fail":[9],"to":[10],"capture,":[11],"including":[12],"mixed":[13],"scripts,":[14],"frequent":[15],"vertical":[16],"writing,":[17],"and":[18,115,150,161,169,206],"a":[19,69],"character":[20,101],"inventory":[21],"far":[22],"larger":[23],"than":[24],"the":[25,42,172,182,190],"Latin":[26],"alphabet.":[27],"Although":[28],"is":[30],"included":[31],"in":[32,91,110],"several":[33],"benchmarks,":[35],"these":[36],"resources":[37],"do":[38],"not":[39],"adequately":[40],"capture":[41],"language-specific":[43],"complexities.":[44],"Meanwhile,":[45],"existing":[46],"visual":[48,111,133],"datasets":[50],"have":[51],"primarily":[52],"focused":[53],"on":[54,77],"scanned":[55],"documents,":[56],"leaving":[57],"in-the-wild":[58],"underexplored.":[61],"To":[62],"fill":[63],"this":[64],"gap,":[65],"we":[66],"introduce":[67],"JaWildText,":[68],"diagnostic":[70],"benchmark":[71],"for":[72,194],"evaluating":[73],"vision-language":[74],"models":[75],"(VLMs)":[76],"understanding.":[81],"JaWildText":[82,196],"contains":[83],"3,241":[84],"instances":[85],"from":[86,147],"2,961":[87],"images":[88],"newly":[89],"captured":[90],"Japan,":[92],"with":[93,210],"1.12":[94],"million":[95],"annotated":[96],"characters":[97],"spanning":[98],"3,643":[99],"unique":[100],"types.":[102],"It":[103],"comprises":[104],"three":[105,183],"complementary":[106],"tasks":[107],"vary":[109],"organization,":[112],"output":[113],"format,":[114],"writing":[116,162],"style:":[117],"(i)":[118],"Dense":[119],"Scene":[120],"Text":[121],"Visual":[122],"Question":[123],"Answering":[124],"(STVQA),":[125],"which":[126,142,154],"requires":[127],"reasoning":[128],"over":[129],"multiple":[130],"pieces":[131],"of":[132,179,201],"evidence;":[135],"(ii)":[136],"Receipt":[137],"Key":[138],"Information":[139],"Extraction":[140],"(KIE),":[141],"tests":[143],"layout-aware":[144],"structured":[145],"extraction":[146],"mobile-captured":[148],"receipts;":[149],"(iii)":[151],"Handwriting":[152],"OCR,":[153],"evaluates":[155],"page-level":[156],"transcription":[157],"across":[158,181],"various":[159],"media":[160],"directions.":[163],"We":[164],"evaluate":[165],"14":[166],"open-weight":[167],"VLMs":[168],"find":[170],"best":[173],"model":[174],"achieves":[175],"an":[176],"average":[177],"score":[178],"0.64":[180],"tasks.":[184],"Error":[185],"analyses":[186],"show":[187],"recognition":[188],"remains":[189],"dominant":[191],"bottleneck,":[192],"especially":[193],"kanji.":[195],"enables":[197],"fine-grained,":[198],"script-aware":[199],"diagnosis":[200],"capabilities,":[205],"will":[207],"be":[208],"released":[209],"evaluation":[211],"code.":[212]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-02T00:00:00"}
