{"id":"https://openalex.org/W4412925420","doi":"https://doi.org/10.1007/s11042-025-21049-w","title":"Multi-modality guided cross-attention for visual question answering","display_name":"Multi-modality guided cross-attention for visual question answering","publication_year":2025,"publication_date":"2025-07-25","ids":{"openalex":"https://openalex.org/W4412925420","doi":"https://doi.org/10.1007/s11042-025-21049-w"},"language":"en","primary_location":{"id":"doi:10.1007/s11042-025-21049-w","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11042-025-21049-w","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11042-025-21049-w.pdf","source":{"id":"https://openalex.org/S110206669","display_name":"Multimedia Tools and Applications","issn_l":"1380-7501","issn":["1380-7501","1573-7721"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Multimedia Tools and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s11042-025-21049-w.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084350939","display_name":"Muhammad Zeeshan Khan","orcid":"https://orcid.org/0000-0002-7905-3345"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Muhammad Zeeshan Khan","raw_affiliation_strings":["School of Information Technology, Deakin University, Waurn Ponds, Geelong, 3216, Victoria, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Technology, Deakin University, Waurn Ponds, Geelong, 3216, Victoria, Australia","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024634557","display_name":"Duc Thanh Nguyen","orcid":"https://orcid.org/0000-0002-2285-2066"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Duc Thanh Nguyen","raw_affiliation_strings":["School of Information Technology, Deakin University, Waurn Ponds, Geelong, 3216, Victoria, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Technology, Deakin University, Waurn Ponds, Geelong, 3216, Victoria, Australia","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101494233","display_name":"Thanh Thi Nguyen","orcid":"https://orcid.org/0000-0002-3738-4218"},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Thanh Thi Nguyen","raw_affiliation_strings":["Faculty of Information Technology, Monash University, Clayton, Melbourne, 3800, Victoria, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Faculty of Information Technology, Monash University, Clayton, Melbourne, 3800, Victoria, Australia","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072346367","display_name":"Anuroop Gaddam","orcid":"https://orcid.org/0000-0001-5112-9849"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Anuroop Gaddam","raw_affiliation_strings":["School of Information Technology, Deakin University, Waurn Ponds, Geelong, 3216, Victoria, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Technology, Deakin University, Waurn Ponds, Geelong, 3216, Victoria, Australia","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033585021","display_name":"Imran Razzak","orcid":"https://orcid.org/0000-0002-3930-6600"},"institutions":[{"id":"https://openalex.org/I31746571","display_name":"UNSW Sydney","ror":"https://ror.org/03r8z3t63","country_code":"AU","type":"education","lineage":["https://openalex.org/I31746571"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Imran Razzak","raw_affiliation_strings":["School of Computer Science and Engineering, University of New South Wales, Sydney, NSW, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of New South Wales, Sydney, NSW, Australia","institution_ids":["https://openalex.org/I31746571"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5084350939"],"corresponding_institution_ids":["https://openalex.org/I149704539"],"apc_list":null,"apc_paid":null,"fwci":1.0044,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.78616427,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"84","issue":"39","first_page":"47543","last_page":"47565"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8780243992805481},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.7429044842720032},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.6210393905639648},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3416610360145569},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3307208716869354},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3290141224861145}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8780243992805481},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.7429044842720032},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.6210393905639648},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3416610360145569},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3307208716869354},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3290141224861145}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/s11042-025-21049-w","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11042-025-21049-w","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11042-025-21049-w.pdf","source":{"id":"https://openalex.org/S110206669","display_name":"Multimedia Tools and Applications","issn_l":"1380-7501","issn":["1380-7501","1573-7721"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Multimedia Tools and Applications","raw_type":"journal-article"},{"id":"pmh:oai:figshare.com:article/29817959","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal contribution"}],"best_oa_location":{"id":"doi:10.1007/s11042-025-21049-w","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11042-025-21049-w","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11042-025-21049-w.pdf","source":{"id":"https://openalex.org/S110206669","display_name":"Multimedia Tools and Applications","issn_l":"1380-7501","issn":["1380-7501","1573-7721"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Multimedia Tools and Applications","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320970","display_name":"Deakin University","ror":"https://ror.org/02czsnj07"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412925420.pdf","grobid_xml":"https://content.openalex.org/works/W4412925420.grobid-xml"},"referenced_works_count":52,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1895577753","https://openalex.org/W1933349210","https://openalex.org/W1947481528","https://openalex.org/W1981276685","https://openalex.org/W2064675550","https://openalex.org/W2085425470","https://openalex.org/W2097117768","https://openalex.org/W2108598243","https://openalex.org/W2157331557","https://openalex.org/W2194775991","https://openalex.org/W2202226326","https://openalex.org/W2250539671","https://openalex.org/W2560730294","https://openalex.org/W2572649260","https://openalex.org/W2745461083","https://openalex.org/W2747623286","https://openalex.org/W2786356574","https://openalex.org/W2802922942","https://openalex.org/W2962749469","https://openalex.org/W2963028801","https://openalex.org/W2963037330","https://openalex.org/W2963097937","https://openalex.org/W2963150162","https://openalex.org/W2963163009","https://openalex.org/W2963191264","https://openalex.org/W2963260436","https://openalex.org/W2963383024","https://openalex.org/W2963622213","https://openalex.org/W2963644680","https://openalex.org/W2963954913","https://openalex.org/W2964138343","https://openalex.org/W2964157791","https://openalex.org/W3015965768","https://openalex.org/W3035517717","https://openalex.org/W3131825083","https://openalex.org/W3135705837","https://openalex.org/W3152635550","https://openalex.org/W3162090017","https://openalex.org/W3171841353","https://openalex.org/W3185066916","https://openalex.org/W3197360182","https://openalex.org/W4249013746","https://openalex.org/W4311773928","https://openalex.org/W4312645695","https://openalex.org/W4360858130","https://openalex.org/W4388666488","https://openalex.org/W4389684430","https://openalex.org/W4390727864","https://openalex.org/W4391341263","https://openalex.org/W6600195515","https://openalex.org/W6702248584"],"related_works":["https://openalex.org/W2384605597","https://openalex.org/W2387743295","https://openalex.org/W3082787378","https://openalex.org/W2136007095","https://openalex.org/W2366230879","https://openalex.org/W3208425359","https://openalex.org/W2349927912","https://openalex.org/W3159777597","https://openalex.org/W2385859805","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Abstract":[0],"Visual":[1],"Question":[2],"Answering":[3],"(VQA)":[4],"is":[5],"a":[6,47,99],"multimodality":[7],"research":[8],"domain":[9],"that":[10],"intersects":[11],"the":[12,41,87,166,176],"fields":[13],"of":[14,168,181],"computer":[15],"vision":[16],"and":[17,25,32,77,83,127,141,143,160],"natural":[18],"language":[19],"processing":[20,24],"for":[21,106,137,145],"visual-textual":[22],"data":[23],"understanding.":[26],"Traditional":[27],"VQA":[28,158,161],"methods":[29,53],"extract":[30],"visual":[31,76,138],"textual":[33,78,146],"features":[34,42],"from":[35,43,86],"pre-trained":[36],"architectures,":[37],"respectively,":[38],"then":[39],"combine":[40],"both":[44,184],"modalities":[45,112],"in":[46,183],"common":[48],"feature":[49,139,147],"space.":[50],"The":[51,71],"traditional":[52],"perform":[54],"well":[55,114],"on":[56,64,153,178],"high-level":[57],"perception":[58,66],"questions.":[59],"However,":[60],"attaining":[61],"high":[62],"accuracy":[63],"low-level":[65],"questions":[67,182],"still":[68],"remains":[69],"challenging.":[70],"difficulties":[72],"include":[73],"detecting":[74],"relevant":[75],"information,":[79],"building":[80,107],"meaningful":[81],"associations,":[82],"extracting":[84],"insights":[85],"multimodal":[88],"data.":[89],"To":[90],"address":[91],"these":[92],"challenges,":[93],"unlike":[94],"existing":[95,172],"approaches,":[96],"we":[97,119],"propose":[98],"novel":[100],"multi-modality":[101],"guided":[102],"cross":[103],"self-attention":[104],"mechanism":[105],"semantic":[108],"relationships":[109],"within":[110],"individual":[111],"as":[113,115],"between":[116],"them.":[117],"Specifically,":[118],"examine":[120],"visual-guided":[121],"cross-attention":[122,125,129],"(VGCA),":[123],"textual-guided":[124],"(TGCA),":[126],"multi-modality-guided":[128],"(MMGCA).":[130],"We":[131,149],"utilise":[132],"convolutional":[133],"neural":[134],"networks":[135],"(CNNs)":[136],"learning,":[140],"LSTM":[142],"FNET":[144],"learning.":[148],"evaluate":[150],"our":[151,169],"method":[152,170],"two":[154],"benchmark":[155],"datasets,":[156],"including":[157],"1.0":[159],"2.0.":[162],"Experimental":[163],"results":[164],"demonstrate":[165],"superiority":[167],"over":[171],"baselines":[173],"by":[174],"improving":[175],"performance":[177],"various":[179],"types":[180],"datasets.":[185]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
