{"id":"https://openalex.org/W7128625439","doi":"https://doi.org/10.48550/arxiv.2602.09214","title":"VLM-UQBench: A Benchmark for Modality-Specific and Cross-Modality Uncertainties in Vision Language Models","display_name":"VLM-UQBench: A Benchmark for Modality-Specific and Cross-Modality Uncertainties in Vision Language Models","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7128625439","doi":"https://doi.org/10.48550/arxiv.2602.09214"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.09214","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125679234","display_name":"Chenyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Chenyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125659028","display_name":"Tianle Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Tianle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125624654","display_name":"H. M. Sabbir Ahmad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ahmad, H. M. Sabbir","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100882415","display_name":"Kayhan N. Batmanghelich","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Batmanghelich, Kayhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125654405","display_name":"Wenchao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wenchao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5125679234"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.0015999999595806003,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/uncertainty-quantification","display_name":"Uncertainty quantification","score":0.7638999819755554},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.7354000210762024},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.57669997215271},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5687999725341797},{"id":"https://openalex.org/keywords/perturbation","display_name":"Perturbation (astronomy)","score":0.45660001039505005},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.42239999771118164},{"id":"https://openalex.org/keywords/measurement-uncertainty","display_name":"Measurement uncertainty","score":0.41370001435279846}],"concepts":[{"id":"https://openalex.org/C32230216","wikidata":"https://www.wikidata.org/wiki/Q7882499","display_name":"Uncertainty quantification","level":2,"score":0.7638999819755554},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.7354000210762024},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5965999960899353},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.57669997215271},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5687999725341797},{"id":"https://openalex.org/C177918212","wikidata":"https://www.wikidata.org/wiki/Q803623","display_name":"Perturbation (astronomy)","level":2,"score":0.45660001039505005},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.42239999771118164},{"id":"https://openalex.org/C137209882","wikidata":"https://www.wikidata.org/wiki/Q1403517","display_name":"Measurement uncertainty","level":2,"score":0.41370001435279846},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.38199999928474426},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3725000023841858},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35089999437332153},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3303000032901764},{"id":"https://openalex.org/C177803969","wikidata":"https://www.wikidata.org/wiki/Q29205","display_name":"Uncertainty analysis","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2955000102519989},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2655999958515167},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25270000100135803}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.09214","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.09214","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.09214","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.09214","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Uncertainty":[0],"quantification":[1],"(UQ)":[2],"is":[3,18],"vital":[4],"for":[5,44,204],"ensuring":[6],"that":[7,92],"vision-language":[8],"models":[9],"(VLMs)":[10],"behave":[11],"safely":[12],"and":[13,46,68,72,82,102,107,120,135,156,160,198],"reliably.":[14],"A":[15],"central":[16],"challenge":[17],"to":[19,22,99,110,177],"localize":[20],"uncertainty":[21,49,70,144,202],"its":[23],"source,":[24],"determining":[25],"whether":[26],"it":[27],"arises":[28],"from":[29,59],"the":[30,32,37,60,94,139,179,199],"image,":[31],"text,":[33],"or":[34],"misalignment":[35],"between":[36,194],"two.":[38],"We":[39,86],"introduce":[40],"VLM-UQBench,":[41],"a":[42,73,112,191],"benchmark":[43],"modality-specific":[45,133,143],"cross-modal":[47,69,84],"data":[48],"in":[50],"VLMs,":[51],"It":[52],"consists":[53],"of":[54,96,114],"600":[55],"real-world":[56],"samples":[57],"drawn":[58],"VizWiz":[61],"dataset,":[62],"curated":[63],"into":[64],"clean,":[65],"image-,":[66],"text-,":[67],"subsets,":[71],"scalable":[74],"perturbation":[75,186],"pipeline":[76],"with":[77,105,147],"8":[78],"visual,":[79],"5":[80],"textual,":[81],"3":[83],"perturbations.":[85],"further":[87],"propose":[88],"two":[89],"simple":[90],"metrics":[91],"quantify":[93],"sensitivity":[95],"UQ":[97,115,129,151,163,196],"scores":[98,152],"these":[100],"perturbations":[101],"their":[103],"correlation":[104],"hallucinations,":[106],"use":[108],"them":[109],"evaluate":[111],"range":[113],"methods":[116,130,164],"across":[117],"four":[118],"VLMs":[119],"three":[121],"datasets.":[122],"Empirically,":[123],"we":[124],"find":[125],"that:":[126],"(i)":[127],"existing":[128],"exhibit":[131],"strong":[132],"specialization":[134],"substantial":[136],"dependence":[137],"on":[138,170],"underlying":[140],"VLM,":[141],"(ii)":[142],"frequently":[145],"co-occurs":[146],"hallucinations":[148],"while":[149],"current":[150,195],"provide":[153],"only":[154],"weak":[155],"inconsistent":[157],"risk":[158],"signals,":[159],"(iii)":[161],"although":[162],"can":[165],"rival":[166],"reasoning-based":[167],"chain-of-thought":[168],"baselines":[169],"overt,":[171],"group-level":[172],"ambiguity,":[173],"they":[174],"largely":[175],"fail":[176],"detect":[178],"subtle,":[180],"instance-level":[181],"ambiguity":[182],"introduced":[183],"by":[184],"our":[185],"pipeline.":[187],"These":[188],"results":[189],"highlight":[190],"significant":[192],"gap":[193],"practices":[197],"fine-grained,":[200],"modality-aware":[201],"required":[203],"reliable":[205],"VLM":[206],"deployment.":[207]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-12T00:00:00"}
