{"id":"https://openalex.org/W4415709030","doi":"https://doi.org/10.1109/icme59968.2025.11208962","title":"IllusionBench: A Large-scale and Comprehensive Benchmark for Visual Illusion Understanding in Vision-Language Models","display_name":"IllusionBench: A Large-scale and Comprehensive Benchmark for Visual Illusion Understanding in Vision-Language Models","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415709030","doi":"https://doi.org/10.1109/icme59968.2025.11208962"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11208962","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11208962","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100395351","display_name":"Yiming Zhang","orcid":"https://orcid.org/0000-0001-6450-8485"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yiming Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110641806","display_name":"Zicheng Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zicheng Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015823480","display_name":"Xinyi Wei","orcid":"https://orcid.org/0000-0001-8318-6119"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyi Wei","raw_affiliation_strings":["Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062521285","display_name":"Xiaohong Liu","orcid":"https://orcid.org/0000-0001-9413-2706"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaohong Liu","raw_affiliation_strings":["Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Guangtao Zhai","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangtao Zhai","raw_affiliation_strings":["Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":null,"display_name":"Xiongkuo Min","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiongkuo Min","raw_affiliation_strings":["Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,Institute of Image Communication and Network Engineering,China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100395351"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31220483,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6877999901771545,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6877999901771545,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.049400001764297485,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.018699999898672104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/illusion","display_name":"Illusion","score":0.8554999828338623},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5823000073432922},{"id":"https://openalex.org/keywords/visual-hallucination","display_name":"Visual Hallucination","score":0.5594000220298767},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5473999977111816},{"id":"https://openalex.org/keywords/optical-illusion","display_name":"Optical illusion","score":0.5145999789237976},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5012999773025513},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.45509999990463257},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.4138000011444092}],"concepts":[{"id":"https://openalex.org/C184047640","wikidata":"https://www.wikidata.org/wiki/Q182593","display_name":"Illusion","level":2,"score":0.8554999828338623},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6305000185966492},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6055999994277954},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5823000073432922},{"id":"https://openalex.org/C2908998935","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Visual Hallucination","level":2,"score":0.5594000220298767},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5473999977111816},{"id":"https://openalex.org/C139793654","wikidata":"https://www.wikidata.org/wiki/Q174923","display_name":"Optical illusion","level":3,"score":0.5145999789237976},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5012999773025513},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.45509999990463257},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.41429999470710754},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.4138000011444092},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4052000045776367},{"id":"https://openalex.org/C2778251979","wikidata":"https://www.wikidata.org/wiki/Q7936617","display_name":"Visual processing","level":3,"score":0.3955000042915344},{"id":"https://openalex.org/C17305859","wikidata":"https://www.wikidata.org/wiki/Q382944","display_name":"Soar","level":2,"score":0.36329999566078186},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.35989999771118164},{"id":"https://openalex.org/C2779200073","wikidata":"https://www.wikidata.org/wiki/Q18395575","display_name":"Visual masking","level":4,"score":0.31189998984336853},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C2776817677","wikidata":"https://www.wikidata.org/wiki/Q4839818","display_name":"Backward masking","level":3,"score":0.2863999903202057},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.27950000762939453},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2793000042438507},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2669000029563904},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.2630000114440918}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11208962","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11208962","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1540682689","https://openalex.org/W2031342017","https://openalex.org/W2059243251","https://openalex.org/W2280337858","https://openalex.org/W2793696519","https://openalex.org/W2901893099","https://openalex.org/W3083092310","https://openalex.org/W3103072285","https://openalex.org/W4249587718","https://openalex.org/W4285387508","https://openalex.org/W4389518675","https://openalex.org/W4392172801","https://openalex.org/W4402716381","https://openalex.org/W4402726948","https://openalex.org/W4402727669","https://openalex.org/W4404356490"],"related_works":[],"abstract_inverted_index":{"Current":[0],"Visual":[1],"Language":[2],"Models":[3],"(VLMs)":[4],"show":[5],"impressive":[6],"image":[7],"understanding":[8],"but":[9,61,116,141],"struggle":[10],"with":[11],"visual":[12,51,185],"illusions,":[13,24,107,163],"especially":[14],"in":[15,118,123,158,187],"real-world":[16,63,106],"scenarios.":[17],"Existing":[18],"benchmarks":[19],"focus":[20],"on":[21,94,133,138,154],"classical":[22,114,155],"cognitive":[23,59],"which":[25],"have":[26],"been":[27],"learned":[28],"by":[29],"state-of-the-art":[30],"(SOTA)":[31],"VLMs,":[32],"revealing":[33],"issues":[34,122],"such":[35],"as":[36],"hallucinations":[37,153],"and":[38,74,84,100,136,180],"limited":[39],"perceptual":[40],"abilities.":[41],"To":[42],"address":[43,80],"this":[44,95],"gap,":[45],"we":[46,108],"introduce":[47],"IllusionBench,":[48],"a":[49],"comprehensive":[50,182],"illusion":[52],"dataset":[53,67,96],"that":[54,79,112],"encompasses":[55],"not":[56],"only":[57],"classic":[58],"illusions":[60,111,156,186],"also":[62],"scene":[64],"illusions.":[65,88],"This":[66],"features":[68],"1,051":[69,75],"images,":[70],"5,548":[71],"question-answer":[72],"pairs,":[73],"golden":[76],"text":[77],"descriptions":[78],"the":[81,87,148,173,178],"presence,":[82],"causes,":[83],"content":[85],"of":[86,175],"We":[89],"evaluate":[90],"ten":[91],"SOTA":[92,124],"VLMs":[93,188],"using":[97],"true-or-false,":[98],"multiple-choice,":[99],"open-ended":[101],"tasks.":[102],"In":[103,147],"addition":[104],"to":[105,172,189],"design":[109],"trap":[110,162],"resemble":[113],"patterns":[115],"differ":[117],"reality,":[119],"highlighting":[120],"hallucination":[121],"models.":[125,169],"The":[126],"top-performing":[127],"model,":[128],"GPT-4o,":[129],"achieves":[130],"80.59%":[131],"accuracy":[132],"true-or-false":[134],"tasks":[135],"76.75%":[137],"multiple-choice":[139],"questions,":[140],"still":[142],"lags":[143],"behind":[144,166],"human":[145],"performance.":[146],"semantic":[149],"description":[150],"task,":[151],"GPT-4o\u2019s":[152],"result":[157],"low":[159],"scores":[160],"for":[161,184],"even":[164],"falling":[165],"some":[167],"open-source":[168],"IllusionBench":[170],"is,":[171],"best":[174],"our":[176],"knowledge,":[177],"largest":[179],"most":[181],"benchmark":[183],"date.":[190]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-30T00:00:00"}
