{"id":"https://openalex.org/W4415708918","doi":"https://doi.org/10.1109/icme59968.2025.11210062","title":"Multi-Passage Retrieval-Augmented Multimodal Language Generation Model for Knowledge-Based Visual Question Answering","display_name":"Multi-Passage Retrieval-Augmented Multimodal Language Generation Model for Knowledge-Based Visual Question Answering","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708918","doi":"https://doi.org/10.1109/icme59968.2025.11210062"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11210062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5051607982","display_name":"Siyu Cheng","orcid":"https://orcid.org/0009-0007-3554-4365"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Siyu Cheng","raw_affiliation_strings":["Hunan University,College of Computer Science and Electronic Engineering,Changsha,China"],"affiliations":[{"raw_affiliation_string":"Hunan University,College of Computer Science and Electronic Engineering,Changsha,China","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100604018","display_name":"Chao Yang","orcid":"https://orcid.org/0000-0001-8774-8115"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Yang","raw_affiliation_strings":["Hunan University,College of Computer Science and Electronic Engineering,Changsha,China"],"affiliations":[{"raw_affiliation_string":"Hunan University,College of Computer Science and Electronic Engineering,Changsha,China","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100635445","display_name":"Bin Jiang","orcid":"https://orcid.org/0000-0002-9153-4360"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Jiang","raw_affiliation_strings":["Hunan University,College of Computer Science and Electronic Engineering,Changsha,China"],"affiliations":[{"raw_affiliation_string":"Hunan University,College of Computer Science and Electronic Engineering,Changsha,China","institution_ids":["https://openalex.org/I16609230"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5051607982"],"corresponding_institution_ids":["https://openalex.org/I16609230"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31206264,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9907000064849854,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9907000064849854,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0024999999441206455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8495000004768372},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.7229999899864197},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.7131999731063843},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5853999853134155},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.48030000925064087},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44369998574256897}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8495000004768372},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7872999906539917},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.7229999899864197},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.7131999731063843},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6273999810218811},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5853999853134155},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5372999906539917},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.48030000925064087},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44369998574256897},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3264999985694885},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2784000039100647},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2599000036716461},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11210062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2947312908","https://openalex.org/W2962985038","https://openalex.org/W2998702515","https://openalex.org/W3099700870","https://openalex.org/W3101703188","https://openalex.org/W3139224848","https://openalex.org/W3156789018","https://openalex.org/W3172845486","https://openalex.org/W3196798856","https://openalex.org/W3199693760","https://openalex.org/W4226321975","https://openalex.org/W4283777642","https://openalex.org/W4312846625","https://openalex.org/W4312971273","https://openalex.org/W4384652670","https://openalex.org/W4385565470","https://openalex.org/W4385571214","https://openalex.org/W4385574177","https://openalex.org/W4386065596"],"related_works":[],"abstract_inverted_index":{"Knowledge-based":[0],"Visual":[1],"Question":[2],"Answering":[3],"is":[4,137],"a":[5,10,21,52,110,117,138,159,174],"challenging":[6],"task":[7],"that":[8],"requires":[9],"VQA":[11,160],"system":[12],"to":[13,17,37,62,74,84],"utilize":[14,63,123],"external":[15],"knowledge":[16,89,130],"answer":[18,133],"questions":[19],"about":[20],"given":[22],"image.":[23],"Most":[24],"current":[25],"retrieval-augmented":[26,171],"methods":[27,35],"suffer":[28],"from":[29,152],"two":[30],"notable":[31],"limitations.":[32],"Firstly,":[33],"these":[34],"tend":[36],"convert":[38],"images":[39],"into":[40],"plain":[41],"text":[42],"and":[43,116,132],"subsequently":[44],"discard":[45],"the":[46,93,124,165,169],"original":[47,125],"image":[48,126],"input,":[49],"resulting":[50],"in":[51],"loss":[53],"of":[54,88,95,144,162],"valuable":[55],"visual":[56],"information.":[57],"Secondly,":[58],"they":[59],"often":[60],"fail":[61],"multiple":[64,80,153],"retrieved":[65],"passages":[66],"effectively,":[67],"typically":[68],"depending":[69],"on":[70,164],"just":[71],"one":[72],"passage":[73],"generate":[75],"answers":[76,147],"or":[77],"directly":[78,127],"concatenating":[79],"passages,":[81],"which":[82],"leads":[83],"excessively":[85],"long":[86],"sequences":[87],"input.":[90],"To":[91],"overcome":[92],"limitations":[94],"previous":[96],"methods,":[97],"we":[98],"introduce":[99],"an":[100],"innovative":[101],"retriever-generator":[102],"framework":[103,108],"for":[104,128],"knowledge-based":[105],"VQA.":[106],"This":[107],"comprises":[109],"Multimodal":[111],"Queries-oriented":[112],"Knowledge":[113],"Retriever":[114],"(MQ-KR)":[115],"Multi-Passage":[118],"Retrieval-Augmented":[119],"Generator":[120],"(MP-RAG).":[121],"We":[122],"both":[129],"retrieval":[131],"generation.":[134],"The":[135],"generator":[136],"multimodal":[139],"language":[140],"generation":[141],"model":[142],"capable":[143],"producing":[145],"accurate":[146],"by":[148,173],"effectively":[149],"aggregating":[150],"evidence":[151],"passages.":[154],"Our":[155],"proposed":[156],"method":[157,172],"achieves":[158],"Score":[161],"59.89%":[163],"OK-VQA":[166],"dataset,":[167],"surpassing":[168],"SOTA":[170],"large":[175],"margin":[176],"(+4.37%).":[177]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-30T00:00:00"}
