{"id":"https://openalex.org/W7123360743","doi":"https://doi.org/10.1109/tmm.2026.3651096","title":"Toward Top-Down Reasoning: An Explainable Multi-Agent Approach for Visual Question Answering","display_name":"Toward Top-Down Reasoning: An Explainable Multi-Agent Approach for Visual Question Answering","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7123360743","doi":"https://doi.org/10.1109/tmm.2026.3651096"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2026.3651096","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3651096","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zeqing Wang","orcid":"https://orcid.org/0009-0006-6389-6678"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zeqing Wang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0006-6389-6678","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122851111","display_name":"Wentao Wan","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wentao Wan","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0004-2063-4382","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120846476","display_name":"Qiqing Lao","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiqing Lao","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122904623","display_name":"Runmeng Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I187400657","display_name":"South China Normal University","ror":"https://ror.org/01kq0pv72","country_code":"CN","type":"education","lineage":["https://openalex.org/I187400657"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Runmeng Chen","raw_affiliation_strings":["South China Normal University, Guangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"South China Normal University, Guangzhou, China","institution_ids":["https://openalex.org/I187400657"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111084096","display_name":"Minjie Lang","orcid":null},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minjie Lang","raw_affiliation_strings":["Northeastern University, Shenyang, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University, Shenyang, China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122871869","display_name":"Xiao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I143868143","display_name":"Anhui University","ror":"https://ror.org/05th6yx34","country_code":"CN","type":"education","lineage":["https://openalex.org/I143868143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiao Wang","raw_affiliation_strings":["Anhui University, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-0008-0659","affiliations":[{"raw_affiliation_string":"Anhui University, Hefei, China","institution_ids":["https://openalex.org/I143868143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102018879","display_name":"Feng Gao","orcid":"https://orcid.org/0000-0002-4086-5776"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feng Gao","raw_affiliation_strings":["Peking University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-1843-3180","affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114763704","display_name":"Keze Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Keze Wang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-7817-8306","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122847691","display_name":"Liang Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Lin","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-2248-3755","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.05952088,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"28","issue":null,"first_page":"3081","last_page":"3096"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.868399977684021},{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8510000109672546},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.6173999905586243},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6110000014305115},{"id":"https://openalex.org/keywords/commonsense-reasoning","display_name":"Commonsense reasoning","score":0.5891000032424927},{"id":"https://openalex.org/keywords/expansive","display_name":"Expansive","score":0.5619000196456909},{"id":"https://openalex.org/keywords/commonsense-knowledge","display_name":"Commonsense knowledge","score":0.5532000064849854},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.5393000245094299},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3749000132083893}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.868399977684021},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8510000109672546},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8126999735832214},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.6173999905586243},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6110000014305115},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.5891000032424927},{"id":"https://openalex.org/C2780502288","wikidata":"https://www.wikidata.org/wiki/Q28838156","display_name":"Expansive","level":3,"score":0.5619000196456909},{"id":"https://openalex.org/C30542707","wikidata":"https://www.wikidata.org/wiki/Q1603203","display_name":"Commonsense knowledge","level":3,"score":0.5532000064849854},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.5393000245094299},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4828999936580658},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3928000032901764},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3749000132083893},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.373199999332428},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C115925183","wikidata":"https://www.wikidata.org/wiki/Q1412694","display_name":"Knowledge-based systems","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.34779998660087585},{"id":"https://openalex.org/C149106765","wikidata":"https://www.wikidata.org/wiki/Q1982453","display_name":"Denotation (semiotics)","level":3,"score":0.3287999927997589},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3077999949455261},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.29429998993873596},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C49929091","wikidata":"https://www.wikidata.org/wiki/Q1930471","display_name":"General knowledge","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C2780613888","wikidata":"https://www.wikidata.org/wiki/Q6423394","display_name":"Knowledge retrieval","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.26589998602867126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2026.3651096","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3651096","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1099008978","display_name":null,"funder_award_id":"23hytd006","funder_id":"https://openalex.org/F4320321160","funder_display_name":"Sun Yat-sen University"},{"id":"https://openalex.org/G327634725","display_name":null,"funder_award_id":"62276283","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6612841016","display_name":null,"funder_award_id":"2023A1515012985","funder_id":"https://openalex.org/F4320337111","funder_display_name":"Basic and Applied Basic Research Foundation of Guangdong Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321160","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null},{"id":"https://openalex.org/F4320337111","display_name":"Basic and Applied Basic Research Foundation of Guangdong Province","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"methods":[1],"to":[2,21,31,73,94,117,155,172,175,221],"enhance":[3,95],"Vision-Language":[4],"Models":[5,34],"(VLMs)":[6],"for":[7,150,189],"Visual":[8],"Question":[9],"Answering":[10],"(VQA)":[11],"have":[12],"focused":[13],"on":[14,166,231,256,260],"strengthening":[15],"their":[16],"inference":[17],"capabilities,":[18],"enabling":[19],"them":[20],"tackle":[22],"VQA":[23,48,122,225,233],"tasks":[24],"independently":[25],"rather":[26],"than":[27],"merely":[28],"as":[29],"aids":[30],"Large":[32],"Language":[33],"(LLMs).":[35],"However,":[36],"these":[37],"approaches":[38],"often":[39],"ignore":[40],"the":[41,46,52,56,63,89,96,120,131,147,151,173,177,190,196,210,216,223,241,250,263],"rich":[42],"commonsense":[43],"knowledge":[44,91,199,208],"inside":[45],"given":[47,121,191],"image":[49],"sampled":[50],"from":[51,209],"real":[53],"world,":[54],"limiting":[55],"full":[57],"potential":[58],"of":[59,92,98,200,238,246],"VLMs.":[60],"Inspired":[61],"by":[62,87,124,194],"human":[64],"top-down":[65,137],"reasoning":[66,138],"process,":[67],"i.e.,":[68,106],"systematically":[69],"exploring":[70],"relevant":[71,127,157,169],"issues":[72,128,170],"derive":[74],"a":[75,81,136,184,236],"comprehensive":[76,229],"answer,":[77],"this":[78],"work":[79],"introduces":[80],"novel,":[82],"explainable":[83],"multi-agent":[84],"collaboration":[85],"framework":[86,102,248],"leveraging":[88,195],"expansive":[90],"LLMs":[93],"capabilities":[97],"VLMs":[99,239],"themselves.":[100],"Our":[101],"comprises":[103],"three":[104],"agents,":[105],"<italic":[107,110,114,142,160,178,203,211,217],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[108,111,115,143,161,179,204,212,218],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Responder</i>,":[109],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Seeker</i>,":[112],"and":[113,129,153,182,215,228,244,258],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Integrator</i>,":[116],"collaboratively":[118],"answer":[119,133,148],"question":[123,152,174],"seeking":[125],"its":[126],"generating":[130],"final":[132,224],"in":[134,262],"such":[135],"process.":[139],"The":[140,159,202],"VLM-based":[141],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Responder</i>":[144,180,219],"agent":[145,181,206,214,220],"generates":[146],"candidates":[149],"responds":[154],"other":[156],"issues.":[158],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Seeker</i>":[162,213],"agent,":[163],"primarily":[164],"based":[165],"LLM,":[167],"identifies":[168],"related":[171],"inform":[176],"constructs":[183],"Multi-View":[185],"Knowledge":[186],"Base":[187],"(MVKB)":[188],"visual":[192],"scene":[193],"build-in":[197],"world":[198],"LLM.":[201],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Integrator</i>":[205],"combines":[207],"produce":[222],"answer.":[226],"Extensive":[227],"evaluations":[230],"diverse":[232],"datasets":[234],"with":[235],"variety":[237],"demonstrate":[240],"superior":[242],"performance":[243],"interpretability":[245],"our":[247],"over":[249],"baseline":[251],"method,":[252],"e.g.,":[253],"5.7%":[254],"improvement":[255],"VQA-RAD":[257],"5.2%":[259],"Winoground":[261],"zero-shot":[264],"setting":[265],"without":[266],"extra":[267],"training":[268],"cost.":[269]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-16T08:24:45.110214","created_date":"2026-01-14T00:00:00"}
