{"id":"https://openalex.org/W7153256287","doi":"https://doi.org/10.48550/arxiv.2604.08340","title":"PokeGym: A Visually-Driven Long-Horizon Benchmark for Vision-Language Models","display_name":"PokeGym: A Visually-Driven Long-Horizon Benchmark for Vision-Language Models","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153256287","doi":"https://doi.org/10.48550/arxiv.2604.08340"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.08340","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08340","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.08340","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128159723","display_name":"Ruizhi Zhang (752030)","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Ruizhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110374454","display_name":"Ye Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020554727","display_name":"Yuangang Pan","orcid":"https://orcid.org/0000-0002-7950-4900"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Yuangang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008722385","display_name":"Chuanfu Shen","orcid":"https://orcid.org/0000-0001-8782-5950"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Chuanfu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133351295","display_name":"Zhilin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhilin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133351031","display_name":"Ting Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Ting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133377502","display_name":"Wen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129683431","display_name":"Lixin Duan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duan, Lixin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5128159723"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9891999959945679,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9891999959945679,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.0010999999940395355,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6680999994277954},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6266999840736389},{"id":"https://openalex.org/keywords/intuition","display_name":"Intuition","score":0.46540001034736633},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4381999969482422},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4374000132083893},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.423799991607666},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.37450000643730164},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.36579999327659607},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3481000065803528}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7980999946594238},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6680999994277954},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6266999840736389},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5325999855995178},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5076000094413757},{"id":"https://openalex.org/C132010649","wikidata":"https://www.wikidata.org/wiki/Q189222","display_name":"Intuition","level":2,"score":0.46540001034736633},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4381999969482422},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4374000132083893},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.423799991607666},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.37450000643730164},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3684999942779541},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3481000065803528},{"id":"https://openalex.org/C159023740","wikidata":"https://www.wikidata.org/wiki/Q623276","display_name":"Deadlock","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.3416999876499176},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.3343000113964081},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3163999915122986},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.3077000081539154},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.28369998931884766},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2535000145435333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.08340","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08340","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.08340","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08340","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.773398756980896,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"Vision-Language":[1],"Models":[2],"(VLMs)":[3],"have":[4],"achieved":[5],"remarkable":[6],"progress":[7],"in":[8,14],"static":[9],"visual":[10,51,133],"understanding,":[11],"their":[12],"deployment":[13],"complex":[15,76],"3D":[16,77],"embodied":[17],"environments":[18,39],"remains":[19],"severely":[20],"limited.":[21],"Existing":[22],"benchmarks":[23],"suffer":[24,180],"from":[25,181],"four":[26],"critical":[27],"deficiencies:":[28],"(1)":[29],"passive":[30],"perception":[31],"tasks":[32,114],"circumvent":[33],"interactive":[34],"dynamics;":[35],"(2)":[36],"simplified":[37],"2D":[38],"fail":[40],"to":[41,130,185,197,204],"assess":[42],"depth":[43],"perception;":[44],"(3)":[45],"privileged":[46],"state":[47],"leakage":[48],"bypasses":[49],"genuine":[50],"processing;":[52],"and":[53,60,106,120,137,214],"(4)":[54],"human":[55],"evaluation":[56,142],"is":[57],"prohibitively":[58],"expensive":[59],"unscalable.":[61],"We":[62],"introduce":[63],"PokeGym,":[64],"a":[65,74,144,164,174],"visually-driven":[66],"long-horizon":[67],"benchmark":[68,111,215],"instantiated":[69],"within":[70],"Pokemon":[71],"Legends:":[72],"Z-A,":[73],"visually":[75],"open-world":[78],"Role-Playing":[79],"Game.":[80],"PokeGym":[81],"enforces":[82],"strict":[83],"code-level":[84],"isolation:":[85],"agents":[86],"operate":[87],"solely":[88],"on":[89,219],"raw":[90],"RGB":[91],"observations":[92],"while":[93],"an":[94],"independent":[95],"evaluator":[96],"verifies":[97],"success":[98],"via":[99],"memory":[100],"scanning,":[101],"ensuring":[102],"pure":[103],"vision-based":[104],"decision-making":[105],"automated,":[107],"scalable":[108],"assessment.":[109],"The":[110,212],"comprises":[112],"30":[113],"(30-220":[115],"steps)":[116],"spanning":[117],"navigation,":[118],"interaction,":[119],"mixed":[121],"scenarios,":[122],"with":[123,161,168],"three":[124],"instruction":[125],"granularities":[126],"(Visual-Guided,":[127],"Step-Guided,":[128],"Goal-Only)":[129],"systematically":[131],"deconstruct":[132],"grounding,":[134],"semantic":[135],"reasoning,":[136],"autonomous":[138],"exploration":[139],"capabilities.":[140],"Our":[141],"reveals":[143],"key":[145],"limitation":[146],"of":[147],"current":[148],"VLMs:":[149],"physical":[150],"deadlock":[151],"recovery,":[152],"rather":[153],"than":[154],"high-level":[155],"planning,":[156],"constitutes":[157],"the":[158,202],"primary":[159],"bottleneck,":[160],"deadlocks":[162],"showing":[163],"strong":[165],"negative":[166],"correlation":[167],"task":[169],"success.":[170],"Furthermore,":[171],"we":[172],"uncover":[173],"metacognitive":[175],"divergence:":[176],"weaker":[177],"models":[178,189],"predominantly":[179],"Unaware":[182],"Deadlocks":[183,192],"(oblivious":[184],"entrapment),":[186],"whereas":[187],"advanced":[188],"exhibit":[190],"Aware":[191],"(recognizing":[193],"entrapment":[194],"yet":[195],"failing":[196],"recover).":[198],"These":[199],"findings":[200],"highlight":[201],"need":[203],"integrate":[205],"explicit":[206],"spatial":[207],"intuition":[208],"into":[209],"VLM":[210],"architectures.":[211],"code":[213],"will":[216],"be":[217],"available":[218],"GitHub.":[220]},"counts_by_year":[],"updated_date":"2026-04-11T06:19:08.300824","created_date":"2026-04-11T00:00:00"}
