{"id":"https://openalex.org/W7161146263","doi":"https://doi.org/10.48550/arxiv.2605.13277","title":"Utility-Oriented Visual Evidence Selection for Multimodal Retrieval-Augmented Generation","display_name":"Utility-Oriented Visual Evidence Selection for Multimodal Retrieval-Augmented Generation","publication_year":2026,"publication_date":"2026-05-13","ids":{"openalex":"https://openalex.org/W7161146263","doi":"https://doi.org/10.48550/arxiv.2605.13277"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.13277","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13277","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.13277","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036629858","display_name":"Weiqing Luo","orcid":"https://orcid.org/0009-0004-8041-3258"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Weiqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101387266","display_name":"ZongYe Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Zongye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136159068","display_name":"Xiao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136177720","display_name":"Zhiyuan Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Zhiyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136162063","display_name":"Haofeng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Haofeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136114942","display_name":"Ziyi Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ziyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6499000191688538},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5587000250816345},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.5566999912261963},{"id":"https://openalex.org/keywords/helpfulness","display_name":"Helpfulness","score":0.5526999831199646},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.5026000142097473},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.4733999967575073},{"id":"https://openalex.org/keywords/empirical-evidence","display_name":"Empirical evidence","score":0.46149998903274536},{"id":"https://openalex.org/keywords/latent-variable","display_name":"Latent variable","score":0.42800000309944153}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7394999861717224},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6499000191688538},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5909000039100647},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5629000067710876},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5587000250816345},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.5566999912261963},{"id":"https://openalex.org/C2781265381","wikidata":"https://www.wikidata.org/wiki/Q5710255","display_name":"Helpfulness","level":2,"score":0.5526999831199646},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.5026000142097473},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.4733999967575073},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.46149998903274536},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.42800000309944153},{"id":"https://openalex.org/C65965080","wikidata":"https://www.wikidata.org/wiki/Q1806885","display_name":"Latent variable model","level":3,"score":0.36730000376701355},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.31189998984336853},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.30799999833106995},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C112933361","wikidata":"https://www.wikidata.org/wiki/Q2845258","display_name":"Probabilistic latent semantic analysis","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C182365436","wikidata":"https://www.wikidata.org/wiki/Q50701","display_name":"Variable (mathematics)","level":2,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.13277","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13277","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.13277","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13277","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Visual":[0],"evidence":[1,33,40,48,73,83,106],"selection":[2,41],"is":[3,91],"a":[4,56,69,99],"critical":[5],"component":[6],"of":[7,31,64,72],"multimodal":[8,39,110],"retrieval-augmented":[9],"generation":[10],"(RAG),":[11],"yet":[12],"existing":[13],"methods":[14],"typically":[15],"rely":[16],"on":[17,55,87,113],"semantic":[18],"relevance":[19],"or":[20],"surface-level":[21],"similarity,":[22],"which":[23],"are":[24],"often":[25],"misaligned":[26],"with":[27],"the":[28,51,62],"actual":[29],"utility":[30,49,107],"visual":[32],"for":[34],"downstream":[35],"reasoning.":[36],"We":[37,96],"reformulate":[38],"from":[42],"an":[43],"information-theoretic":[44],"perspective":[45],"by":[46,84],"defining":[47],"as":[50],"information":[52,85],"gain":[53,86],"induced":[54],"model's":[57],"output":[58],"distribution.":[59],"To":[60],"overcome":[61],"intractability":[63],"answer-space":[65,94],"optimization,":[66],"we":[67],"introduce":[68],"latent":[70,89],"notion":[71],"helpfulness":[74],"and":[75,115],"theoretically":[76],"show":[77],"that,":[78],"under":[79],"mild":[80],"assumptions,":[81],"ranking":[82],"this":[88],"variable":[90],"equivalent":[92],"to":[93],"utility.":[95],"further":[97],"propose":[98],"training-free,":[100],"surrogate-accelerated":[101],"framework":[102],"that":[103,122],"efficiently":[104],"estimates":[105],"using":[108],"lightweight":[109],"models.":[111],"Experiments":[112],"MRAG-Bench":[114],"Visual-RAG":[116],"across":[117],"multiple":[118],"model":[119],"families":[120],"demonstrate":[121],"our":[123],"method":[124],"consistently":[125],"outperforms":[126],"state-of-the-art":[127],"RAG":[128],"baselines":[129],"while":[130],"achieving":[131],"substantial":[132],"reductions":[133],"in":[134],"computational":[135],"cost.":[136]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-15T00:00:00"}
