{"id":"https://openalex.org/W4403791842","doi":"https://doi.org/10.1145/3664647.3681271","title":"VL-Reader: Vision and Language Reconstructor is an Effective Scene Text Recognizer","display_name":"VL-Reader: Vision and Language Reconstructor is an Effective Scene Text Recognizer","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403791842","doi":"https://doi.org/10.1145/3664647.3681271"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681271","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681271","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041568693","display_name":"Humen Zhong","orcid":"https://orcid.org/0009-0002-8676-0811"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Humen Zhong","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101406829","display_name":"Zhibo Yang","orcid":"https://orcid.org/0000-0003-2343-7750"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"ZhiBo Yang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053443918","display_name":"Zhaohai Li","orcid":"https://orcid.org/0000-0002-7704-3231"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaohai Li","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037614932","display_name":"Peng Wang","orcid":"https://orcid.org/0009-0001-8617-1550"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Wang","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103392241","display_name":"Jun Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Tang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069858870","display_name":"Wenqing Cheng","orcid":"https://orcid.org/0009-0000-3452-9170"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenqing Cheng","raw_affiliation_strings":["Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100669593","display_name":"Cong Yao","orcid":"https://orcid.org/0000-0001-6564-4796"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cong Yao","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5041568693"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.2493,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54550125,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"4207","last_page":"4216"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7459452748298645},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.5245074033737183},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5036527514457703},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.4162719249725342},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4085143208503723},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.36821043491363525}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7459452748298645},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.5245074033737183},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5036527514457703},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.4162719249725342},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4085143208503723},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36821043491363525}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3681271","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681271","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W654550266","https://openalex.org/W1922126009","https://openalex.org/W1971822075","https://openalex.org/W1981283549","https://openalex.org/W2050995497","https://openalex.org/W2146835493","https://openalex.org/W2152928267","https://openalex.org/W2194187530","https://openalex.org/W2343052201","https://openalex.org/W2525579820","https://openalex.org/W2747329762","https://openalex.org/W2810983211","https://openalex.org/W2875814315","https://openalex.org/W2962790387","https://openalex.org/W2963233387","https://openalex.org/W2963712589","https://openalex.org/W2970910956","https://openalex.org/W2997749585","https://openalex.org/W2997864923","https://openalex.org/W2998382406","https://openalex.org/W3003711889","https://openalex.org/W3003868038","https://openalex.org/W3003921261","https://openalex.org/W3035449864","https://openalex.org/W3035682985","https://openalex.org/W3110267192","https://openalex.org/W3175855397","https://openalex.org/W3177684257","https://openalex.org/W3179897446","https://openalex.org/W3181186176","https://openalex.org/W3202415716","https://openalex.org/W3204479434","https://openalex.org/W3206651063","https://openalex.org/W3214620264","https://openalex.org/W4225562651","https://openalex.org/W4283821822","https://openalex.org/W4312115774","https://openalex.org/W4312843595","https://openalex.org/W4312879041","https://openalex.org/W4313156423","https://openalex.org/W4382202677","https://openalex.org/W4387968234","https://openalex.org/W4390871832","https://openalex.org/W4390872592"],"related_works":["https://openalex.org/W4231937131","https://openalex.org/W3188962172","https://openalex.org/W323219885","https://openalex.org/W2063928587","https://openalex.org/W2772917594","https://openalex.org/W1487966966","https://openalex.org/W4312825515","https://openalex.org/W1589342014","https://openalex.org/W4306742369","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Text":[0],"recognition":[1,88],"is":[2],"an":[3,35,84,183,191,225],"inherent":[4],"integration":[5],"of":[6,38,94,150,194],"vision":[7,53,79,103,218],"and":[8,17,42,54,56,67,80,104,126,143,167,219],"language,":[9,81],"encompassing":[10],"the":[11,18,22,40,50,60,95,99,107,159,172,175,201],"visual":[12,41,125,166],"texture":[13],"in":[14,59,77,98,171],"stroke":[15],"patterns":[16],"semantic":[19,43],"context":[20,142],"among":[21],"character":[23],"sequences.":[24],"Towards":[25],"advanced":[26],"text":[27,87,168,228],"recognition,":[28],"there":[29],"are":[30],"three":[31],"key":[32],"challenges:":[33],"(1)":[34],"encoder":[36],"capable":[37],"representing":[39],"distributions;":[44],"(2)":[45],"a":[46,73,114,132],"decoder":[47],"that":[48,217],"ensures":[49],"alignment":[51],"between":[52,102],"semantics;":[55],"(3)":[57],"consistency":[58,153],"framework":[61],"during":[62],"pre-training,":[63],"if":[64],"it":[65],"exists,":[66],"fine-tuning.":[68,157],"Inspired":[69],"by":[70,203],"masked":[71,140,165,187],"autoencoding,":[72],"successful":[74],"pre-training":[75,155,160],"strategy":[76],"both":[78,164],"we":[82,111,130],"propose":[83],"innovative":[85],"scene":[86,227],"approach,":[89],"named":[90],"VL-Reader.":[91],"The":[92,148,205,214],"novelty":[93],"VL-Reader":[96,151,162],"lies":[97],"pervasive":[100],"interplay":[101],"language":[105,220],"throughout":[106],"entire":[108],"process.":[109],"Concretely,":[110],"first":[112],"introduce":[113],"Masked":[115,133],"Visual-Linguistic":[116,134],"Reconstruction":[117],"(MVLR)":[118],"objective,":[119],"which":[120],"aims":[121],"at":[122],"simultaneously":[123],"modeling":[124],"linguistic":[127],"information.":[128],"Then,":[129],"design":[131],"Decoder":[135],"(MVLD)":[136],"to":[137,156,178],"further":[138],"leverage":[139],"vision-language":[141],"achieve":[144],"bi-modal":[145],"feature":[146],"interaction.":[147],"architecture":[149],"maintains":[152],"from":[154,182],"In":[158],"stage,":[161,174],"reconstructs":[163],"tokens,":[169],"while":[170],"fine-tuning":[173],"network":[176],"degrades":[177],"reconstruct":[179],"all":[180],"characters":[181],"image":[184],"without":[185],"any":[186],"regions.":[188],"VL-reader":[189],"achieves":[190],"average":[192],"accuracy":[193],"97.1%":[195],"on":[196,211],"six":[197],"typical":[198],"datasets,":[199],"surpassing":[200],"SOTA":[202],"1.1%.":[204],"improvement":[206],"was":[207],"even":[208],"more":[209],"significant":[210],"challenging":[212],"datasets.":[213],"results":[215],"demonstrate":[216],"reconstructor":[221],"can":[222],"serve":[223],"as":[224],"effective":[226],"recognizer.":[229]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
