{"id":"https://openalex.org/W7125957124","doi":"https://doi.org/10.1109/smc58881.2025.11343454","title":"Grounded Multi-modal Conversation for Zero-shot Visual Question Answering","display_name":"Grounded Multi-modal Conversation for Zero-shot Visual Question Answering","publication_year":2025,"publication_date":"2025-10-05","ids":{"openalex":"https://openalex.org/W7125957124","doi":"https://doi.org/10.1109/smc58881.2025.11343454"},"language":null,"primary_location":{"id":"doi:10.1109/smc58881.2025.11343454","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343454","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100631887","display_name":"Mohammad Reza Zarei","orcid":"https://orcid.org/0000-0002-3517-7864"},"institutions":[{"id":"https://openalex.org/I67031392","display_name":"Carleton University","ror":"https://ror.org/02qtvee93","country_code":"CA","type":"education","lineage":["https://openalex.org/I67031392"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Mohammad Reza Zarei","raw_affiliation_strings":["Carleton University,School of Computer Science,Ottawa,Canada"],"affiliations":[{"raw_affiliation_string":"Carleton University,School of Computer Science,Ottawa,Canada","institution_ids":["https://openalex.org/I67031392"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001084471","display_name":"Abbas Akkasi","orcid":"https://orcid.org/0000-0003-4700-4896"},"institutions":[{"id":"https://openalex.org/I67031392","display_name":"Carleton University","ror":"https://ror.org/02qtvee93","country_code":"CA","type":"education","lineage":["https://openalex.org/I67031392"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Abbas Akkasi","raw_affiliation_strings":["Carleton University,School of Computer Science,Ottawa,Canada"],"affiliations":[{"raw_affiliation_string":"Carleton University,School of Computer Science,Ottawa,Canada","institution_ids":["https://openalex.org/I67031392"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122258333","display_name":"Majid Komeili","orcid":null},"institutions":[{"id":"https://openalex.org/I67031392","display_name":"Carleton University","ror":"https://ror.org/02qtvee93","country_code":"CA","type":"education","lineage":["https://openalex.org/I67031392"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Majid Komeili","raw_affiliation_strings":["Carleton University,School of Computer Science,Ottawa,Canada"],"affiliations":[{"raw_affiliation_string":"Carleton University,School of Computer Science,Ottawa,Canada","institution_ids":["https://openalex.org/I67031392"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100631887"],"corresponding_institution_ids":["https://openalex.org/I67031392"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.68786626,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"876","last_page":"883"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0006000000284984708,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8073999881744385},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.6887000203132629},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.5821999907493591},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5702000260353088},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5321000218391418},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5126000046730042},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.48429998755455017},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4790000021457672}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8073999881744385},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.6887000203132629},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6581000089645386},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.5821999907493591},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5702000260353088},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5321000218391418},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5126000046730042},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.48429998755455017},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4790000021457672},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41670000553131104},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3953999876976013},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3357999920845032},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C2777877512","wikidata":"https://www.wikidata.org/wiki/Q1116097","display_name":"Common ground","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.28870001435279846},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.2847000062465668},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2815000116825104},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2759999930858612}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/smc58881.2025.11343454","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343454","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.8151779770851135}],"awards":[],"funders":[{"id":"https://openalex.org/F4320334593","display_name":"Natural Sciences and Engineering Research Council of Canada","ror":"https://ror.org/01h531d29"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2144554289","https://openalex.org/W2560730294","https://openalex.org/W2947312908","https://openalex.org/W2988326850","https://openalex.org/W3044175177","https://openalex.org/W3091588028","https://openalex.org/W3159131359","https://openalex.org/W3173220247","https://openalex.org/W3199693760","https://openalex.org/W4385445765","https://openalex.org/W4385571689","https://openalex.org/W4385572364","https://openalex.org/W4385573236","https://openalex.org/W4386076140","https://openalex.org/W4387968292","https://openalex.org/W4388209340","https://openalex.org/W4389520080","https://openalex.org/W4389520252","https://openalex.org/W4393159294","https://openalex.org/W4399881905","https://openalex.org/W4402392501","https://openalex.org/W4402670365","https://openalex.org/W4402811723","https://openalex.org/W4404612908"],"related_works":[],"abstract_inverted_index":{"Zero-shot":[0,140],"visual":[1,123,168,190,201],"question":[2,176],"answering":[3],"(VQA)":[4],"poses":[5],"a":[6,68,145,158,163],"formidable":[7],"challenge":[8],"at":[9,234],"the":[10,73,85,97,101,105,108,113,121,126,150,172,175,187,223],"intersection":[11],"of":[12,100,107,115,153,174,215],"computer":[13],"vision":[14],"and":[15,43,155,177,183,191,194,197,220],"natural":[16],"language":[17,36],"processing.":[18],"Traditionally,":[19],"this":[20,88,131,133],"problem":[21],"has":[22,90],"been":[23,58,91],"tackled":[24],"using":[25],"end-to-end":[26,218],"pre-trained":[27,109],"vision-language":[28],"models":[29,37],"(VLMs).":[30],"However,":[31,71],"recent":[32],"advancements":[33],"in":[34,50,67,75,93,157],"large":[35],"(LLMs)":[38],"demonstrate":[39],"their":[40],"exceptional":[41],"reasoning":[42],"comprehension":[44,196],"abilities,":[45],"making":[46],"them":[47],"valuable":[48],"assets":[49],"multi-modal":[51],"tasks,":[52],"including":[53],"zero-shot":[54,65],"VQA.":[55],"LLMs":[56,154],"have":[57],"previously":[59],"integrated":[60],"with":[61],"VLMs":[62,156,219],"to":[63,111,120,166,171],"solve":[64],"VQA":[66,76,209],"conversation-based":[69,159],"approach.":[70,160],"while":[72],"focus":[74,169],"tasks":[77],"is":[78,118,231],"often":[79],"on":[80,104],"specific":[81],"regions":[82],"rather":[83],"than":[84],"entire":[86,127],"image,":[87],"aspect":[89],"overlooked":[92],"previous":[94],"approaches.":[95],"Consequently,":[96],"overall":[98],"performance":[99],"framework":[102,147],"relies":[103],"ability":[106],"VLM":[110,182],"locate":[112],"region":[114],"interest":[116],"that":[117,148],"relevant":[119],"requested":[122],"information":[124],"within":[125],"image.":[128],"To":[129],"address":[130],"challenge,":[132],"paper":[134],"proposes":[135],"Grounded":[136],"Multi-modal":[137],"Conversation":[138],"for":[139,200],"Visual":[141],"Question":[142],"Answering":[143],"(GMC-VQA),":[144],"region-based":[146],"leverages":[149],"complementary":[151],"strengths":[152],"We":[161,203],"employ":[162],"grounding":[164],"mechanism":[165],"refine":[167],"according":[170],"semantics":[173],"foster":[178],"collaborative":[179],"interaction":[180],"between":[181,189],"LLM,":[184],"effectively":[185],"bridging":[186],"gap":[188],"textual":[192],"modalities":[193],"enhancing":[195],"response":[198],"generation":[199],"queries.":[202],"evaluate":[204],"GMC-VQA":[205],"across":[206],"three":[207],"diverse":[208],"datasets,":[210],"achieving":[211],"substantial":[212],"average":[213],"improvements":[214],"10.04%":[216],"over":[217,222],"2.52%":[221],"state-of-the-art":[224],"VLM-LLM":[225],"communication-based":[226],"framework,":[227],"respectively.":[228],"Our":[229],"code":[230],"publicly":[232],"available":[233],"https://github.com/mrzarei5/GMC-VQA.":[235]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-29T00:00:00"}
