{"id":"https://openalex.org/W4304142414","doi":"https://doi.org/10.1145/3503161.3547977","title":"From Token to Word: OCR Token Evolution via Contrastive Learning and Semantic Matching for Text-VQA","display_name":"From Token to Word: OCR Token Evolution via Contrastive Learning and Semantic Matching for Text-VQA","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304142414","doi":"https://doi.org/10.1145/3503161.3547977"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3547977","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547977","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101481925","display_name":"Zanxia Jin","orcid":"https://orcid.org/0000-0002-7990-5544"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zan-Xia Jin","raw_affiliation_strings":["University of Science and Technology Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068937750","display_name":"Mike Zheng Shou","orcid":"https://orcid.org/0000-0002-7681-2166"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Mike Zheng Shou","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101984155","display_name":"Fang Zhou","orcid":"https://orcid.org/0000-0001-5478-7898"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fang Zhou","raw_affiliation_strings":["University of Science and Technology Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115501011","display_name":"Satoshi Tsutsui","orcid":"https://orcid.org/0000-0001-8003-2616"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Satoshi Tsutsui","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072850755","display_name":"Jingyan Qin","orcid":"https://orcid.org/0000-0002-4101-4316"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingyan Qin","raw_affiliation_strings":["University of Science and Technology Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074514262","display_name":"Xu-Cheng Yin","orcid":"https://orcid.org/0000-0003-0023-0220"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu-Cheng Yin","raw_affiliation_strings":["University of Science and Technology Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101481925"],"corresponding_institution_ids":["https://openalex.org/I92403157"],"apc_list":null,"apc_paid":null,"fwci":0.531,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.75189891,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4564","last_page":"4572"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.850758969783783},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8214534521102905},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7133276462554932},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6972458362579346},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.6920795440673828},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5094202160835266},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5083746314048767},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.47753918170928955},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.4769529104232788},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.44978535175323486},{"id":"https://openalex.org/keywords/levenshtein-distance","display_name":"Levenshtein distance","score":0.42238783836364746},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.42003169655799866},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.10863268375396729},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10391610860824585}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.850758969783783},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8214534521102905},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7133276462554932},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6972458362579346},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.6920795440673828},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5094202160835266},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5083746314048767},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.47753918170928955},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.4769529104232788},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.44978535175323486},{"id":"https://openalex.org/C2777515626","wikidata":"https://www.wikidata.org/wiki/Q496939","display_name":"Levenshtein distance","level":2,"score":0.42238783836364746},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.42003169655799866},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.10863268375396729},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10391610860824585},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3547977","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547977","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.699999988079071,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G2035329896","display_name":null,"funder_award_id":"202006460059","funder_id":"https://openalex.org/F4320322725","funder_display_name":"China Scholarship Council"},{"id":"https://openalex.org/G3968842904","display_name":null,"funder_award_id":"NRF-NRFF13-2021-0008","funder_id":"https://openalex.org/F4320320709","funder_display_name":"National Research Foundation Singapore"},{"id":"https://openalex.org/G453106366","display_name":null,"funder_award_id":"62076024, 62006018","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8772010899","display_name":null,"funder_award_id":"62125601","funder_id":"https://openalex.org/F4320336125","funder_display_name":"National Science Fund for Distinguished Young Scholars"}],"funders":[{"id":"https://openalex.org/F4320320709","display_name":"National Research Foundation Singapore","ror":"https://ror.org/03cpyc314"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322725","display_name":"China Scholarship Council","ror":"https://ror.org/04atp4p48"},{"id":"https://openalex.org/F4320336125","display_name":"National Science Fund for Distinguished Young Scholars","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2053317383","https://openalex.org/W2493916176","https://openalex.org/W2745461083","https://openalex.org/W2809273748","https://openalex.org/W2810983211","https://openalex.org/W2875814315","https://openalex.org/W2884093133","https://openalex.org/W2897054934","https://openalex.org/W2922794508","https://openalex.org/W2979382951","https://openalex.org/W2988326850","https://openalex.org/W3004082545","https://openalex.org/W3007556011","https://openalex.org/W3031222954","https://openalex.org/W3034336960","https://openalex.org/W3034390599","https://openalex.org/W3034943799","https://openalex.org/W3041222599","https://openalex.org/W3093385053","https://openalex.org/W3094071022","https://openalex.org/W3095771422","https://openalex.org/W3096719817","https://openalex.org/W3108319047","https://openalex.org/W3108614371","https://openalex.org/W3112075608","https://openalex.org/W3115287481","https://openalex.org/W3179897446","https://openalex.org/W3181159501","https://openalex.org/W3184364189","https://openalex.org/W3205050305","https://openalex.org/W3206082179","https://openalex.org/W3215633354"],"related_works":["https://openalex.org/W2384605597","https://openalex.org/W2387743295","https://openalex.org/W2970530566","https://openalex.org/W2967478618","https://openalex.org/W2997152889","https://openalex.org/W4385572700","https://openalex.org/W3124932927","https://openalex.org/W2397605424","https://openalex.org/W4297786995","https://openalex.org/W4382863885"],"abstract_inverted_index":{"Text-based":[0],"Visual":[1],"Question":[2],"Answering":[3],"(Text-VQA)":[4],"is":[5,16,144],"a":[6,59,124,141,159],"question-answering":[7],"task":[8],"to":[9,63,89,148,170],"understand":[10],"scene":[11],"text,":[12],"where":[13],"the":[14,26,48,75,80,85,90,114,118,130,150,168,172,176,193],"text":[15,27],"usually":[17],"recognized":[18,39],"by":[19,109,127],"Optical":[20],"Character":[21],"Recognition":[22],"(OCR)":[23],"systems.":[24,53],"However,":[25],"from":[28,175],"OCR":[29,43,65,68,77,91,99,111,119,136,181],"systems":[30],"often":[31],"includes":[32],"spelling":[33],"errors,":[34],"such":[35],"as":[36,40],"\"pepsi\"":[37],"being":[38],"\"peosi\".":[41],"These":[42],"errors":[44,66],"are":[45,138],"one":[46],"of":[47,132,185],"major":[49],"challenges":[50],"for":[51],"Text-VQA":[52,61],"To":[54,93],"address":[55],"this,":[56],"we":[57,72,96,157],"propose":[58,97],"novel":[60],"method":[62,191],"alleviate":[64],"via":[67,113],"token":[69],"evolution.":[70],"First,":[71],"artificially":[73],"create":[74],"misspelled":[76,135,180],"tokens":[78,112,120,137],"in":[79,123,134],"training":[81],"time,":[82],"and":[83,121,146,199],"make":[84],"system":[86],"more":[87],"robust":[88],"errors.":[92],"be":[94,205],"specific,":[95],"an":[98],"Token-Word":[100],"Contrastive":[101],"(TWC)":[102],"learning":[103],"task,":[104],"which":[105,166],"pre-trains":[106],"word":[107,154,174],"representation":[108],"augmenting":[110],"Levenshtein":[115],"distance":[116],"between":[117],"words":[122],"dictionary.":[125],"Second,":[126],"assuming":[128],"that":[129,189],"majority":[131],"characters":[133],"still":[139],"correct,":[140],"multimodal":[142],"transformer":[143],"proposed":[145],"fine-tuned":[147],"predict":[149],"answer":[151],"using":[152],"character-based":[153],"embedding.":[155],"Specifically,":[156],"introduce":[158],"vocabulary":[160,177],"predictor":[161],"with":[162,179],"character-level":[163],"semantic":[164],"matching,":[165],"enables":[167],"model":[169],"recover":[171],"correct":[173],"even":[178],"tokens.":[182],"A":[183],"variety":[184],"experimental":[186],"evaluations":[187],"show":[188],"our":[190],"outperforms":[192],"state-of-the-art":[194],"methods":[195],"on":[196],"both":[197],"TextVQA":[198],"ST-VQA":[200],"datasets.":[201],"The":[202],"code":[203],"will":[204],"released":[206],"at":[207],"https://github.com/xiaojino/TWA.":[208]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":1}],"updated_date":"2026-05-30T09:04:40.226872","created_date":"2025-10-10T00:00:00"}
