{"id":"https://openalex.org/W7161555570","doi":"https://doi.org/10.48550/arxiv.2605.15864","title":"Are VLMs Seeing or Just Saying? Uncovering the Illusion of Visual Re-examination","display_name":"Are VLMs Seeing or Just Saying? Uncovering the Illusion of Visual Re-examination","publication_year":2026,"publication_date":"2026-05-15","ids":{"openalex":"https://openalex.org/W7161555570","doi":"https://doi.org/10.48550/arxiv.2605.15864"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.15864","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.15864","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.15864","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136447407","display_name":"Chufan Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Chufan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136403290","display_name":"Cheng Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125478434","display_name":"Yaokang Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yaokang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109684428","display_name":"Linhao Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Linghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125479028","display_name":"Bo Shui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shui, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136404447","display_name":"Taylor Berg-Kirkpatrick","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Berg-Kirkpatrick, Taylor","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136394915","display_name":"Xuezhe Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Xuezhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8973000049591064,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8973000049591064,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.01850000023841858,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.008899999782443047,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/illusion","display_name":"Illusion","score":0.5444999933242798},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4456000030040741},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.43790000677108765},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.36980000138282776},{"id":"https://openalex.org/keywords/optical-illusion","display_name":"Optical illusion","score":0.3691999912261963},{"id":"https://openalex.org/keywords/visual-rhetoric","display_name":"Visual rhetoric","score":0.3677000105381012},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.33649998903274536}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5914999842643738},{"id":"https://openalex.org/C184047640","wikidata":"https://www.wikidata.org/wiki/Q182593","display_name":"Illusion","level":2,"score":0.5444999933242798},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.5293999910354614},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4456000030040741},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.43790000677108765},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43560001254081726},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4203000068664551},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.36980000138282776},{"id":"https://openalex.org/C139793654","wikidata":"https://www.wikidata.org/wiki/Q174923","display_name":"Optical illusion","level":3,"score":0.3691999912261963},{"id":"https://openalex.org/C63075964","wikidata":"https://www.wikidata.org/wiki/Q3277307","display_name":"Visual rhetoric","level":3,"score":0.3677000105381012},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.33649998903274536},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3237000107765198},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3142000138759613},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.30959999561309814},{"id":"https://openalex.org/C200220432","wikidata":"https://www.wikidata.org/wiki/Q7936208","display_name":"Vision science","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28200000524520874},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C105842133","wikidata":"https://www.wikidata.org/wiki/Q1899679","display_name":"Visual communication","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.272599995136261}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.15864","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.15864","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.15864","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.15864","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"often":[3],"produce":[4],"self-reflective":[5],"statements":[6,18,124],"like":[7],"\"let":[8],"me":[9],"check":[10],"the":[11,60,89,168],"figure":[12],"again\"":[13],"during":[14,125],"reasoning.":[15],"Do":[16],"such":[17],"trigger":[19],"genuine":[20],"visual":[21,119,140,159],"re-examination,":[22],"or":[23],"are":[24,101,165],"they":[25],"merely":[26],"learned":[27],"textual":[28],"patterns?":[29],"We":[30,63],"investigate":[31],"this":[32],"via":[33],"VisualSwap,":[34],"an":[35,44],"image-swap":[36],"probing":[37],"framework:":[38],"after":[39],"a":[40,50,83],"model":[41,61],"reasons":[42],"over":[43],"image,":[45],"we":[46],"replace":[47],"it":[48],"with":[49,91],"visually":[51],"similar":[52],"but":[53,121],"semantically":[54],"different":[55],"one":[56],"and":[57,74,80,110,163],"test":[58],"whether":[59],"notices.":[62],"introduce":[64],"VS-Bench,":[65],"800":[66],"image":[67],"pairs":[68],"curated":[69],"from":[70],"MathVista,":[71],"MathVerse,":[72],"MathVision,":[73],"MMMU-Pro.":[75],"Experiments":[76],"on":[77],"Qwen3-VL,":[78],"Kimi-VL,":[79],"ERNIE-VL":[81],"reveal":[82],"striking":[84],"failure:":[85],"models":[86,100],"overwhelmingly":[87],"miss":[88],"swap,":[90],"accuracy":[92],"dropping":[93],"by":[94],"up":[95],"to":[96,139,149,157],"60%.":[97],"Counterintuitively,":[98],"thinking":[99],"nearly":[102],"3x":[103],"more":[104],"vulnerable":[105],"than":[106,152],"their":[107],"instructed":[108],"counterparts,":[109],"scaling":[111],"offers":[112],"no":[113],"mitigation.":[114],"Multi-turn":[115],"user":[116,134],"instructions":[117,135],"restore":[118],"grounding,":[120],"self-generated":[122],"reflective":[123],"continuous":[126],"generation":[127],"do":[128],"not.":[129,145],"Attention":[130],"analysis":[131],"explains":[132],"why:":[133],"substantially":[136],"elevate":[137],"attention":[138],"tokens,":[141],"whereas":[142],"self-reflection":[143],"does":[144],"Current":[146],"VLMs":[147],"tend":[148],"say":[150],"rather":[151],"actually":[153],"see":[154],"when":[155],"claiming":[156],"perform":[158],"re-examination.":[160],"Our":[161],"code":[162],"dataset":[164],"available":[166],"at":[167],"project":[169],"page:":[170],"https://visualswap.github.io":[171]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-19T00:00:00"}
