{"id":"https://openalex.org/W4415538363","doi":"https://doi.org/10.1145/3746027.3754585","title":"OCR-Critic: Aligning Multimodal Large Language Models' Perception through Critical Feedback","display_name":"OCR-Critic: Aligning Multimodal Large Language Models' Perception through Critical Feedback","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415538363","doi":"https://doi.org/10.1145/3746027.3754585"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754585","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754585","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111230256","display_name":"Qiuna Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qiuna Tan","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-1496-1304","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113394951","display_name":"Runqi Qiao","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Runqi Qiao","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-4508-201X","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008320336","display_name":"Guanting Dong","orcid":"https://orcid.org/0000-0002-2318-0281"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guanting Dong","raw_affiliation_strings":["Renmin University of China, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-2318-0281","affiliations":[{"raw_affiliation_string":"Renmin University of China, Beijing, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":null,"display_name":"YiFan Zhang","orcid":"https://orcid.org/0009-0009-1869-3734"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"YiFan Zhang","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-1869-3734","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103945749","display_name":"Minhui Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minhui Wu","raw_affiliation_strings":["WeChat Vision, Tencent, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-3529-7689","affiliations":[{"raw_affiliation_string":"WeChat Vision, Tencent, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038186072","display_name":"Jiapeng Wang","orcid":"https://orcid.org/0000-0002-2060-3488"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiapeng Wang","raw_affiliation_strings":["School of Electronic and Information Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-2060-3488","affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044900346","display_name":"Miaoxuan Zhang","orcid":"https://orcid.org/0000-0002-9185-3353"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Miaoxuan Zhang","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9185-3353","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108781573","display_name":"Yida Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yida Xu","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-2058-8531","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113792681","display_name":"Chong Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chong Sun","raw_affiliation_strings":["WeChat Vision, Tencent, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0009-5365-6250","affiliations":[{"raw_affiliation_string":"WeChat Vision, Tencent, Shenzhen, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100369821","display_name":"Chen Li","orcid":"https://orcid.org/0000-0002-2450-8525"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen Li","raw_affiliation_strings":["WeChat Vision, Tencent, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-2450-8525","affiliations":[{"raw_affiliation_string":"WeChat Vision, Tencent, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100626771","display_name":"Honggang Zhang","orcid":"https://orcid.org/0000-0001-8287-6783"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Honggang Zhang","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-8287-6783","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5111230256"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15601665,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"5385","last_page":"5393"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6152999997138977},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.6025999784469604},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5819000005722046},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5602999925613403},{"id":"https://openalex.org/keywords/error-detection-and-correction","display_name":"Error detection and correction","score":0.5309000015258789},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.48190000653266907},{"id":"https://openalex.org/keywords/strengths-and-weaknesses","display_name":"Strengths and weaknesses","score":0.4311000108718872}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8051000237464905},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6152999997138977},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.6025999784469604},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5819000005722046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5641999840736389},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5602999925613403},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.5309000015258789},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4869999885559082},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.48190000653266907},{"id":"https://openalex.org/C63882131","wikidata":"https://www.wikidata.org/wiki/Q17122954","display_name":"Strengths and weaknesses","level":2,"score":0.4311000108718872},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42239999771118164},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.4203000068664551},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32899999618530273},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3100000023841858},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.28540000319480896},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27889999747276306},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C3018824978","wikidata":"https://www.wikidata.org/wiki/Q2894891","display_name":"Error analysis","level":2,"score":0.26589998602867126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754585","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754585","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W2810028092","https://openalex.org/W2978036638","https://openalex.org/W2979382951","https://openalex.org/W2998621280","https://openalex.org/W3003642782","https://openalex.org/W3034514377","https://openalex.org/W4405399726","https://openalex.org/W4405595839"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,10,16,70],"Large":[3],"Multimodal":[4],"Models":[5],"have":[6],"demonstrated":[7],"impressive":[8],"performance":[9,137],"various":[11],"tasks.":[12],"However,":[13],"their":[14],"capabilities":[15],"error":[17,44,76,104],"detection":[18],"and":[19,95,106,146],"resolution":[20],"for":[21,41],"Optical":[22],"Character":[23],"Recognition":[24],"(OCR)":[25],"remain":[26],"underexplored.":[27],"To":[28,78],"address":[29],"this":[30,48],"gap,":[31],"we":[32,50,83],"construct":[33],"the":[34,125,132],"first":[35],"visual":[36],"instruction":[37],"tuning":[38],"dataset":[39],"specifically":[40],"detailed":[42],"OCR":[43,71,97,119],"analysis.":[45],"Building":[46],"on":[47,139],"foundation,":[49],"develop":[51],"a":[52,86],"universal,":[53],"plug-and-play":[54],"OCR-Critic":[55,115],"model":[56],"that":[57,114],"incorporates":[58],"three":[59],"novel":[60],"dynamic":[61,129],"alignment":[62,130],"strategies.":[63],"These":[64],"strategies":[65],"systematically":[66],"mitigate":[67],"LMMs'":[68,91],"weaknesses":[69],"tasks":[72],"by":[73],"providing":[74],"coarse-to-fine":[75],"feedback.":[77],"comprehensively":[79],"evaluate":[80],"these":[81],"capabilities,":[82],"introduce":[84],"OCR-ERROR,":[85],"benchmark":[87],"designed":[88],"to":[89,93],"assess":[90],"ability":[92],"detect":[94],"categorize":[96],"errors,":[98],"covering":[99],"two":[100],"task":[101],"types,":[102],"diverse":[103],"categories,":[105],"2,400":[107],"rigorously":[108],"validated":[109],"samples.":[110],"Experimental":[111],"results":[112],"show":[113],"effectively":[116],"identifies":[117],"fine-grained":[118],"errors":[120],"across":[121],"multiple":[122],"domains.":[123],"With":[124],"integration":[126],"of":[127],"our":[128],"strategies,":[131],"LMM":[133],"further":[134],"achieves":[135],"substantial":[136],"gains":[138],"four":[140],"prominent":[141],"benchmarks,":[142],"demonstrating":[143],"both":[144],"versatility":[145],"effectiveness.":[147]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
