{"id":"https://openalex.org/W4402982652","doi":"https://doi.org/10.1109/icme57554.2024.10688200","title":"Adversarial Training with OCR modality Perturbation for Scene-Text Visual Question Answering","display_name":"Adversarial Training with OCR modality Perturbation for Scene-Text Visual Question Answering","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4402982652","doi":"https://doi.org/10.1109/icme57554.2024.10688200"},"language":"en","primary_location":{"id":"doi:10.1109/icme57554.2024.10688200","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icme57554.2024.10688200","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101318651","display_name":"Zhixuan Shen","orcid":null},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhixuan Shen","raw_affiliation_strings":["Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085348492","display_name":"Haonan Luo","orcid":"https://orcid.org/0000-0002-9121-2687"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haonan Luo","raw_affiliation_strings":["Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100698042","display_name":"Sijia Li","orcid":"https://orcid.org/0000-0003-4952-4123"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sijia Li","raw_affiliation_strings":["Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070559820","display_name":"Tianrui Li","orcid":"https://orcid.org/0000-0001-7780-104X"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianrui Li","raw_affiliation_strings":["Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University,School of Computing and Artificial Intelligence,Chengdu,China","institution_ids":["https://openalex.org/I4800084"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101318651"],"corresponding_institution_ids":["https://openalex.org/I4800084"],"apc_list":null,"apc_paid":null,"fwci":0.5263,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.66359832,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9735000133514404,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.783169150352478},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7416098713874817},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.6201791763305664},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6099185943603516},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5958779454231262},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5514062643051147},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4418479800224304},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.427827388048172},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.40680354833602905},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3966664671897888},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3844534754753113}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.783169150352478},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7416098713874817},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.6201791763305664},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6099185943603516},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5958779454231262},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5514062643051147},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4418479800224304},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.427827388048172},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.40680354833602905},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3966664671897888},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3844534754753113},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme57554.2024.10688200","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icme57554.2024.10688200","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321543","display_name":"China Postdoctoral Science Foundation","ror":"https://ror.org/0426zh255"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2090048052","https://openalex.org/W2745461083","https://openalex.org/W2963622213","https://openalex.org/W2979382951","https://openalex.org/W2997154779","https://openalex.org/W3004846386","https://openalex.org/W3013224334","https://openalex.org/W3034336960","https://openalex.org/W3095771422","https://openalex.org/W3106859150","https://openalex.org/W3108319047","https://openalex.org/W3175855397","https://openalex.org/W3176851559","https://openalex.org/W3181159501","https://openalex.org/W3205050305","https://openalex.org/W3215633354","https://openalex.org/W4205261319","https://openalex.org/W4304013646","https://openalex.org/W4304142414","https://openalex.org/W4312263373","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W2502115930","https://openalex.org/W2482350142","https://openalex.org/W4246396837","https://openalex.org/W3126451824","https://openalex.org/W4394050964","https://openalex.org/W3211393740","https://openalex.org/W3208049411","https://openalex.org/W3022908591","https://openalex.org/W4285706568","https://openalex.org/W2551249631"],"abstract_inverted_index":{"Scene-Text":[0],"Visual":[1],"Question":[2],"Answering":[3],"(ST-VQA)":[4],"aims":[5],"to":[6,16,50,87,109],"understand":[7],"scene":[8],"text":[9,18,46],"in":[10,80],"images":[11],"and":[12,34,43,135,138],"answer":[13],"questions":[14],"related":[15],"the":[17,26,81,111,115,133],"content.":[19],"Most":[20],"existing":[21],"methods":[22],"heavily":[23],"rely":[24],"on":[25,38,131],"accuracy":[27],"of":[28,84,91],"Optical":[29],"Character":[30],"Recognition":[31],"(OCR)":[32],"systems,":[33],"aggressive":[35],"fine-tuning":[36],"based":[37],"limited":[39],"spatial":[40,64,116],"location":[41],"information":[42,47],"erroneous":[44],"OCR":[45,72,85,92,99,119],"often":[48],"leads":[49],"inevitable":[51],"overfitting.":[52],"In":[53],"this":[54],"paper,":[55],"we":[56,68],"propose":[57],"a":[58,104,140],"multimodal":[59,144],"adversarial":[60,78,145],"training":[61,79],"architecture":[62],"with":[63],"awareness":[65],"capabilities.":[66],"Specifically,":[67],"introduce":[69],"an":[70],"Adversarial":[71],"Enhancement":[73],"(AOE)":[74],"module,":[75],"which":[76],"leverages":[77],"embedding":[82],"space":[83],"modality":[86],"enhance":[88],"fault-tolerant":[89],"representation":[90],"texts,":[93],"thereby":[94],"reducing":[95],"noise":[96],"caused":[97],"by":[98],"errors.":[100],"Simultaneously,":[101],"We":[102],"add":[103],"Spatial-Aware":[105],"Self-Attention":[106],"(SASA)":[107],"mechanism":[108],"help":[110],"model":[112],"better":[113],"capture":[114],"relationships":[117],"among":[118],"tokens.":[120],"Various":[121],"experiments":[122],"demonstrate":[123],"that":[124],"our":[125],"method":[126],"achieves":[127],"significant":[128],"performance":[129],"improvements":[130],"both":[132],"ST-VQA":[134],"TextVQA":[136],"datasets":[137],"provides":[139],"novel":[141],"paradigm":[142],"for":[143],"training.":[146]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2025-12-22T23:10:17.713674","created_date":"2025-10-10T00:00:00"}
