{"id":"https://openalex.org/W7160294855","doi":"https://doi.org/10.1109/wacv61042.2026.00305","title":"Crafting Descriptive Information for a Zero-shot Method to Improve Knowledge-Based Visual Question Answering Performance","display_name":"Crafting Descriptive Information for a Zero-shot Method to Improve Knowledge-Based Visual Question Answering Performance","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7160294855","doi":"https://doi.org/10.1109/wacv61042.2026.00305"},"language":null,"primary_location":{"id":"doi:10.1109/wacv61042.2026.00305","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00305","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135319011","display_name":"Mohammad Mahdi Moradi","orcid":null},"institutions":[{"id":"https://openalex.org/I60158472","display_name":"Concordia University","ror":"https://ror.org/0420zvk78","country_code":"CA","type":"education","lineage":["https://openalex.org/I60158472"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Mohammad Mahdi Moradi","raw_affiliation_strings":["Concordia University,Montreal,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Concordia University,Montreal,Canada","institution_ids":["https://openalex.org/I60158472"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5135389849","display_name":"Sudhir Mudur","orcid":null},"institutions":[{"id":"https://openalex.org/I60158472","display_name":"Concordia University","ror":"https://ror.org/0420zvk78","country_code":"CA","type":"education","lineage":["https://openalex.org/I60158472"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Sudhir Mudur","raw_affiliation_strings":["Concordia University,Montreal,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Concordia University,Montreal,Canada","institution_ids":["https://openalex.org/I60158472"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5135319011"],"corresponding_institution_ids":["https://openalex.org/I60158472"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.92971542,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"3120","last_page":"3128"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9305999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9305999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.006399999838322401,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.006300000008195639,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.623199999332428},{"id":"https://openalex.org/keywords/data-collection","display_name":"Data collection","score":0.2971000075340271},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.2912999987602234},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.2784999907016754},{"id":"https://openalex.org/keywords/information-system","display_name":"Information system","score":0.27459999918937683}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.623199999332428},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.571399986743927},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38019999861717224},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3531999886035919},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3402999937534332},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2791000008583069},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2784999907016754},{"id":"https://openalex.org/C180198813","wikidata":"https://www.wikidata.org/wiki/Q121182","display_name":"Information system","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C3019144022","wikidata":"https://www.wikidata.org/wiki/Q4124998","display_name":"Questions and answers","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.267300009727478},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2583000063896179}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/wacv61042.2026.00305","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00305","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.6074193120002747}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2016089260","https://openalex.org/W2529436507","https://openalex.org/W2560730294","https://openalex.org/W2745461083","https://openalex.org/W2916723116","https://openalex.org/W2947312908","https://openalex.org/W2962735233","https://openalex.org/W2963383024","https://openalex.org/W2963672682","https://openalex.org/W2963954913","https://openalex.org/W2965276171","https://openalex.org/W3139224848","https://openalex.org/W3199693760","https://openalex.org/W4226452284","https://openalex.org/W4312846625","https://openalex.org/W4319301039","https://openalex.org/W4385572364","https://openalex.org/W4385574156","https://openalex.org/W4386065596","https://openalex.org/W4386076140","https://openalex.org/W4387968292","https://openalex.org/W4391133530","https://openalex.org/W4392504399","https://openalex.org/W4404612908","https://openalex.org/W4404782331","https://openalex.org/W4406363714","https://openalex.org/W7126447397","https://openalex.org/W7133196460","https://openalex.org/W7133208651","https://openalex.org/W7133220561"],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"GC-KBVQA,":[2],"a":[3,128],"zero-shot":[4,65,134],"framework":[5,29,87],"for":[6,133],"knowledge-based":[7],"visual":[8,34],"question":[9],"answering":[10],"(KB-VQA)":[11],"that":[12,104],"requires":[13],"no":[14],"additional":[15],"training.":[16],"GC-KBVQA":[17,62,126],"leverages":[18],"pretrained":[19],"models":[20],"together":[21,45],"with":[22,98],"carefully":[23],"designed,":[24],"context-aware":[25],"descriptive":[26],"information.":[27],"The":[28,86],"integrates":[30],"three":[31],"modules\u2014(i)":[32],"question-guided":[33],"grounding,":[35,105],"(ii)":[36],"semantics-based":[37],"caption":[38],"filtering,":[39],"and":[40,54,74,76,108,112,124,130],"(iii)":[41],"inter-stage":[42,113],"feedback\u2014that":[43],"work":[44],"to":[46,69,96,117],"generate":[47],"concise,":[48],"relevant":[49],"prompts":[50],"while":[51],"reducing":[52],"hallucinations":[53],"noisy":[55],"auxiliary":[56],"text.":[57],"Despite":[58],"its":[59],"lightweight":[60],"design,":[61],"outperforms":[63],"strong":[64],"baselines":[66],"by":[67],"up":[68],"+10.97%":[70],"on":[71],"OKVQA,":[72],"A-OKVQA,":[73],"VQAv2,":[75],"approaches":[77],"the":[78],"performance":[79],"of":[80],"few-shot":[81],"systems":[82],"without":[83],"labeled":[84],"data.":[85],"is":[88],"model-agnostic,":[89],"maintaining":[90],"effectiveness":[91],"across":[92],"LLMs":[93],"from":[94],"TinyLLaMA-1B":[95],"Llama3-8B":[97],"minimal":[99],"degradation.":[100],"Ablation":[101],"studies":[102],"confirm":[103],"dual-caption":[106],"generation,":[107],"both":[109],"intra-stage":[110],"filtering":[111],"feedback":[114],"each":[115],"contribute":[116],"accuracy":[118],"improvements.":[119],"By":[120],"combining":[121],"efficiency,":[122],"robustness,":[123],"modularity,":[125],"provides":[127],"practical":[129],"scalable":[131],"direction":[132],"KB-VQA.":[135]},"counts_by_year":[],"updated_date":"2026-05-07T06:04:25.777469","created_date":"2026-05-06T00:00:00"}
