{"id":"https://openalex.org/W4406890284","doi":"https://doi.org/10.26599/bdma.2024.9020079","title":"Seeing and Reasoning: A Simple Deep Learning Approach to Visual Question Answering","display_name":"Seeing and Reasoning: A Simple Deep Learning Approach to Visual Question Answering","publication_year":2025,"publication_date":"2025-01-28","ids":{"openalex":"https://openalex.org/W4406890284","doi":"https://doi.org/10.26599/bdma.2024.9020079"},"language":"en","primary_location":{"id":"doi:10.26599/bdma.2024.9020079","is_oa":true,"landing_page_url":"https://doi.org/10.26599/bdma.2024.9020079","pdf_url":null,"source":{"id":"https://openalex.org/S4210209060","display_name":"Big Data Mining and Analytics","issn_l":"2096-0654","issn":["2096-0654"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311901","host_organization_name":"Tsinghua University Press","host_organization_lineage":["https://openalex.org/P4310311901"],"host_organization_lineage_names":["Tsinghua University Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Big Data Mining and Analytics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.26599/bdma.2024.9020079","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052589361","display_name":"Rufai Yusuf Zakari","orcid":"https://orcid.org/0000-0002-4645-6412"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rufai Yusuf Zakari","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068942427","display_name":"Jim Wilson Owusu","orcid":"https://orcid.org/0000-0003-0425-3761"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jim Wilson Owusu","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070150374","display_name":"Ke Qin","orcid":"https://orcid.org/0000-0001-6174-3877"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ke Qin","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100761497","display_name":"Tao He","orcid":"https://orcid.org/0000-0001-8676-7429"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao He","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101878135","display_name":"Guangchun Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangchun Luo","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731","institution_ids":["https://openalex.org/I150229711"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5052589361"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":5.2971,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.94804928,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"8","issue":"2","first_page":"458","last_page":"478"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9657999873161316,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9236000180244446,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8374706506729126},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.6602996587753296},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5803462266921997},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5038854479789734},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.36846962571144104},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.3469098210334778},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.2656080424785614},{"id":"https://openalex.org/keywords/epistemology","display_name":"Epistemology","score":0.16896846890449524},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.08801805973052979}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8374706506729126},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.6602996587753296},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5803462266921997},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5038854479789734},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36846962571144104},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3469098210334778},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2656080424785614},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.16896846890449524},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.08801805973052979}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.26599/bdma.2024.9020079","is_oa":true,"landing_page_url":"https://doi.org/10.26599/bdma.2024.9020079","pdf_url":null,"source":{"id":"https://openalex.org/S4210209060","display_name":"Big Data Mining and Analytics","issn_l":"2096-0654","issn":["2096-0654"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311901","host_organization_name":"Tsinghua University Press","host_organization_lineage":["https://openalex.org/P4310311901"],"host_organization_lineage_names":["Tsinghua University Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Big Data Mining and Analytics","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:5bbbc5e940b64e5f8a3949fbf6ce24ab","is_oa":true,"landing_page_url":"https://doaj.org/article/5bbbc5e940b64e5f8a3949fbf6ce24ab","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Big Data Mining and Analytics, Vol 8, Iss 2, Pp 458-478 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.26599/bdma.2024.9020079","is_oa":true,"landing_page_url":"https://doi.org/10.26599/bdma.2024.9020079","pdf_url":null,"source":{"id":"https://openalex.org/S4210209060","display_name":"Big Data Mining and Analytics","issn_l":"2096-0654","issn":["2096-0654"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311901","host_organization_name":"Tsinghua University Press","host_organization_lineage":["https://openalex.org/P4310311901"],"host_organization_lineage_names":["Tsinghua University Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Big Data Mining and Analytics","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.4399999976158142,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2250539671","https://openalex.org/W2560730294","https://openalex.org/W2561715562","https://openalex.org/W2745461083","https://openalex.org/W2760103357","https://openalex.org/W2916723116","https://openalex.org/W2946752867","https://openalex.org/W2949686305","https://openalex.org/W2962716332","https://openalex.org/W2962781483","https://openalex.org/W2963176022","https://openalex.org/W2963224792","https://openalex.org/W2963518342","https://openalex.org/W2963656855","https://openalex.org/W2963760481","https://openalex.org/W2963954913","https://openalex.org/W2964118342","https://openalex.org/W2966683369","https://openalex.org/W2970231061","https://openalex.org/W3005881764","https://openalex.org/W3011329351","https://openalex.org/W3026441479","https://openalex.org/W3100712674","https://openalex.org/W3131825083","https://openalex.org/W3141200244","https://openalex.org/W3156729876","https://openalex.org/W3197035208","https://openalex.org/W3203354307","https://openalex.org/W4229003843","https://openalex.org/W4285200267","https://openalex.org/W4310092748","https://openalex.org/W4311773928","https://openalex.org/W4387675775","https://openalex.org/W6620707391","https://openalex.org/W6638318767","https://openalex.org/W6684821475","https://openalex.org/W6736769356","https://openalex.org/W6738893770","https://openalex.org/W6748270630","https://openalex.org/W6751796012","https://openalex.org/W6755207826","https://openalex.org/W6757902542","https://openalex.org/W6765591853","https://openalex.org/W6766904570","https://openalex.org/W6766978945","https://openalex.org/W6768817161","https://openalex.org/W6848494730"],"related_works":["https://openalex.org/W2384605597","https://openalex.org/W2387743295","https://openalex.org/W3082787378","https://openalex.org/W2136007095","https://openalex.org/W2366230879","https://openalex.org/W3208425359","https://openalex.org/W1585007175","https://openalex.org/W2349927912","https://openalex.org/W3159777597","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Visual":[0],"Question":[1],"Answering":[2],"(VQA)":[3],"is":[4],"a":[5,10,39,45,80,96],"complex":[6,109],"task":[7,173],"that":[8,49],"requires":[9],"deep":[11],"understanding":[12],"of":[13,124,142,159,181],"both":[14,187],"visual":[15,31,60,194],"content":[16],"and":[17,29,33,61,92,131,150,190],"natural":[18],"language":[19],"questions.":[20],"The":[21,64],"challenge":[22],"lies":[23],"in":[24,38,193],"enabling":[25,71],"models":[26],"to":[27,34,54,73,75,86,106,153,172],"recognize":[28],"interpret":[30],"elements":[32],"reason":[35],"through":[36],"questions":[37],"multi-step,":[40],"compositional":[41],"manner.":[42],"We":[43],"propose":[44],"novel":[46],"Transformer-based":[47],"model":[48,65,105,191],"introduces":[50],"specialized":[51],"tokenization":[52],"techniques":[53],"effectively":[55],"capture":[56],"intricate":[57],"relationships":[58],"between":[59],"textual":[62],"features.":[63],"employs":[66],"an":[67],"enhanced":[68],"self-attention":[69],"mechanism,":[70,152],"it":[72],"attend":[74],"multiple":[76],"modalities":[77],"simultaneously,":[78],"while":[79],"co-attention":[81,151],"unit":[82],"dynamically":[83],"guides":[84],"focus":[85],"the":[87,104,118,139,147,154,160,166,179],"most":[88],"relevant":[89],"image":[90],"regions":[91],"question":[93],"components.":[94],"Additionally,":[95],"multi-step":[97],"reasoning":[98,110,148,169,188,195],"module":[99,149],"supports":[100],"iterative":[101],"inference,":[102],"allowing":[103],"excel":[107],"at":[108],"tasks.":[111,196],"Extensive":[112],"experiments":[113],"on":[114,126,129,133],"benchmark":[115],"datasets":[116],"demonstrate":[117],"model's":[119,155,167],"superior":[120],"performance,":[121],"with":[122],"accuracies":[123],"98.6%":[125],"CLEVR,":[127],"63.78%":[128],"GQA,":[130],"68.67%":[132],"VQA":[134],"v2.0.":[135],"Ablation":[136],"studies":[137],"confirm":[138],"critical":[140],"contribution":[141],"key":[143],"components,":[144],"such":[145],"as":[146],"effectiveness.":[156],"Qualitative":[157],"analysis":[158],"learned":[161],"attention":[162],"distributions":[163],"further":[164],"illustrates":[165],"dynamic":[168],"process,":[170],"adapting":[171],"complexity.":[174],"Overall,":[175],"our":[176],"study":[177],"advances":[178],"adaptation":[180],"Transformer":[182],"architectures":[183],"for":[184],"VQA,":[185],"enhancing":[186],"capabilities":[189],"interpretability":[192]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-22T08:09:32.410652","created_date":"2025-10-10T00:00:00"}
