{"id":"https://openalex.org/W4417485408","doi":"https://doi.org/10.48550/arxiv.2507.15085","title":"OCRGenBench: A Comprehensive Benchmark for Evaluating OCR Generative Capabilities","display_name":"OCRGenBench: A Comprehensive Benchmark for Evaluating OCR Generative Capabilities","publication_year":2025,"publication_date":"2025-07-20","ids":{"openalex":"https://openalex.org/W4417485408","doi":"https://doi.org/10.48550/arxiv.2507.15085"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2507.15085","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.15085","pdf_url":"https://arxiv.org/pdf/2507.15085","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2507.15085","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014739409","display_name":"Peirong Zhang","orcid":"https://orcid.org/0000-0001-5597-9752"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Peirong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104309972","display_name":"Haowei Xu","orcid":"https://orcid.org/0009-0001-2919-9944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Haowei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108155384","display_name":"Jiaxin Zhang","orcid":"https://orcid.org/0009-0004-7652-8191"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiaxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5105412121","display_name":"G. F. Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Xuhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012066483","display_name":"Xuhan Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Guitao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032090227","display_name":"Zhenhua Yang","orcid":"https://orcid.org/0000-0002-3967-6249"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101725852","display_name":"Junle Liu","orcid":"https://orcid.org/0000-0002-4204-9107"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Junle","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053133067","display_name":"Yu-Yi Zhang","orcid":"https://orcid.org/0000-0001-5554-3756"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhenhua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080674767","display_name":"Lianwen Jin","orcid":"https://orcid.org/0000-0002-5456-0957"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Jin, Lianwen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Lianwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5014739409"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4138999879360199,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4138999879360199,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.23880000412464142,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0575999990105629,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.8471999764442444},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.6118999719619751},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5953999757766724},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5108000040054321},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5062000155448914},{"id":"https://openalex.org/keywords/generative-design","display_name":"Generative Design","score":0.4902999997138977},{"id":"https://openalex.org/keywords/natural","display_name":"Natural (archaeology)","score":0.41510000824928284}],"concepts":[{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.8471999764442444},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6944000124931335},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.6118999719619751},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5953999757766724},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5218999981880188},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5108000040054321},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5062000155448914},{"id":"https://openalex.org/C184408114","wikidata":"https://www.wikidata.org/wiki/Q1502022","display_name":"Generative Design","level":3,"score":0.4902999997138977},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.41510000824928284},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.39959999918937683},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3790999948978424},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.3619999885559082},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.334199994802475},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.33250001072883606},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.30399999022483826},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.27869999408721924},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.2619999945163727}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2507.15085","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.15085","pdf_url":"https://arxiv.org/pdf/2507.15085","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2507.15085","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.15085","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2507.15085","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.15085","pdf_url":"https://arxiv.org/pdf/2507.15085","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Improving":[0],"visual":[1,81,224],"text":[2,26,41,70,82,107,117,143,166,195,225],"synthesis":[3,83],"has":[4],"long":[5],"been":[6],"a":[7,78,162,211],"challenging":[8,56],"and":[9,42,52,72,109,119,127,151,170,200,229],"evolving":[10],"frontier":[11],"for":[12,99],"image":[13],"generation":[14,27,48,146],"models.":[15],"While":[16],"recent":[17],"state-of-the-art":[18],"(SOTA)":[19],"models":[20,179],"have":[21],"made":[22],"remarkable":[23],"strides":[24],"in":[25],"capabilities,":[28,218],"existing":[29],"benchmarks":[30],"inadequately":[31],"assess":[32],"their":[33],"true":[34],"performance":[35],"due":[36],"to":[37,76,97,154,214],"narrow":[38],"scope":[39],"(scene":[40],"posters":[43],"only),":[44],"isolated":[45],"evaluation":[46,230],"(T2I":[47],"or":[49,204],"editing":[50],"separately),":[51],"insufficient":[53],"difficulty":[54],"(lacking":[55],"scenarios).":[57],"To":[58],"bridge":[59],"this":[60],"gap,":[61],"we":[62,90,159],"pioneer":[63],"the":[64,93,220],"unification":[65],"of":[66,137,222],"text-centric":[67],"T2I":[68,115],"generation,":[69,116],"editing,":[71,118],"OCR-related":[73],"image-to-image":[74,121],"translation":[75],"evaluate":[77,215],"model's":[79],"holistic":[80],"abilities,":[84],"i.e.,":[85],"OCR":[86,111,122,216],"generative":[87,112,178,217],"capabilities.":[88],"Accordingly,":[89],"propose":[91],"OCRGenBench,":[92],"most":[94,182],"comprehensive":[95],"benchmark":[96,131,228],"date":[98],"evaluating":[100],"these":[101],"abilities.":[102],"OCRGenBench":[103,209],"covers":[104],"five":[105],"common":[106],"categories":[108],"33":[110],"tasks,":[113],"encompassing":[114],"other":[120],"tasks":[123],"(e.g.,":[124],"document":[125],"dewarping":[126],"handwriting":[128],"removal).":[129],"The":[130,227],"includes":[132],"1,060":[133],"human-annotated":[134],"samples":[135],"consisting":[136],"instruction-image-GT":[138],"triplets,":[139],"deliberately":[140],"featuring":[141],"high":[142],"density,":[144],"diverse":[145],"scales,":[147],"varied":[148],"aspect":[149],"ratios,":[150],"bilingual":[152],"content":[153,198],"capture":[155],"real-world":[156],"complexity.":[157],"Furthermore,":[158],"introduce":[160],"OCRGenScore,":[161],"unified":[163],"metric":[164],"integrating":[165],"accuracy,":[167],"aesthetic":[168],"quality,":[169],"instruction":[171],"following.":[172],"Extensive":[173],"experiments":[174],"on":[175],"19":[176],"cutting-edge":[177],"reveal":[180],"that":[181],"score":[183],"below":[184],"60/100.":[185],"Our":[186],"analysis":[187],"exposes":[188],"critical,":[189],"previously":[190],"overlooked":[191],"limitations,":[192],"including":[193],"poor":[194],"localization,":[196],"unintended":[197],"modifications,":[199],"failures":[201],"with":[202],"dense":[203],"small-scale":[205],"text.":[206],"We":[207],"hope":[208],"establishes":[210],"robust":[212],"standard":[213],"driving":[219],"evolution":[221],"reliable":[223],"synthesis.":[226],"code":[231],"are":[232],"available":[233],"at":[234],"https://github.com/NiceRingNode/Awesome-Generative-Models-for-OCR.":[235]},"counts_by_year":[],"updated_date":"2026-06-05T09:01:59.212387","created_date":"2025-10-10T00:00:00"}
