{"id":"https://openalex.org/W7163071583","doi":"https://doi.org/10.48550/arxiv.2605.31148","title":"SpatialAct: Probing Spatial Reasoning-to-Action Capabilities of VLM Agents in 3D Scenes","display_name":"SpatialAct: Probing Spatial Reasoning-to-Action Capabilities of VLM Agents in 3D Scenes","publication_year":2026,"publication_date":"2026-05-29","ids":{"openalex":"https://openalex.org/W7163071583","doi":"https://doi.org/10.48550/arxiv.2605.31148"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.31148","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.31148","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.31148","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137572274","display_name":"Tianhui Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Tianhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137524769","display_name":"Jie Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137514089","display_name":"Zhiheng Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zhiheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137563208","display_name":"Shengyuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shengyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054633251","display_name":"Yiming Guo","orcid":"https://orcid.org/0000-0002-2634-5651"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067943001","display_name":"Yanxin Xi","orcid":"https://orcid.org/0000-0003-4715-2186"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xi, Yanxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137597217","display_name":"Hangyu Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Hangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137597249","display_name":"Yong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137528527","display_name":"Pan Hui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hui, Pan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8252999782562256,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8252999782562256,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.03319999948143959,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11431","display_name":"Action Observation and Synchronization","score":0.032499998807907104,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6908000111579895},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6442000269889832},{"id":"https://openalex.org/keywords/spatial-cognition","display_name":"Spatial cognition","score":0.6189000010490417},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.4555000066757202},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.43230000138282776},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.4302000105381012},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.42820000648498535},{"id":"https://openalex.org/keywords/spatial-ability","display_name":"Spatial ability","score":0.4219000041484833}],"concepts":[{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6908000111579895},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6654000282287598},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6442000269889832},{"id":"https://openalex.org/C2777371692","wikidata":"https://www.wikidata.org/wiki/Q2178611","display_name":"Spatial cognition","level":3,"score":0.6189000010490417},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5687999725341797},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.4555000066757202},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.43230000138282776},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.4302000105381012},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.42820000648498535},{"id":"https://openalex.org/C2778662690","wikidata":"https://www.wikidata.org/wiki/Q3125339","display_name":"Spatial ability","level":3,"score":0.4219000041484833},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.42149999737739563},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.4207000136375427},{"id":"https://openalex.org/C170494330","wikidata":"https://www.wikidata.org/wiki/Q1778434","display_name":"Cognitive map","level":3,"score":0.3758000135421753},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36500000953674316},{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.3488999903202057},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3375000059604645},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3285999894142151},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3154999911785126},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.30570000410079956},{"id":"https://openalex.org/C158709400","wikidata":"https://www.wikidata.org/wiki/Q3578586","display_name":"Spatial ecology","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C2986522900","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relationship","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C2781179811","wikidata":"https://www.wikidata.org/wiki/Q28134081","display_name":"Spatial configuration","level":3,"score":0.26429998874664307},{"id":"https://openalex.org/C43729271","wikidata":"https://www.wikidata.org/wiki/Q3560550","display_name":"Spatial memory","level":4,"score":0.26030001044273376}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.31148","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.31148","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.31148","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.31148","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Humans":[0],"can":[1,44,120],"effortlessly":[2],"perceive":[3],"spatial":[4,11,34,47,72,101,125,133,155],"layouts,":[5],"form":[6],"cognitive":[7],"representations,":[8],"reason":[9],"about":[10],"relations,":[12],"and":[13,36,52,95,135],"translate":[14],"such":[15],"reasoning":[16,37,126],"into":[17],"actions":[18,55,138],"in":[19,74],"everyday":[20],"3D":[21,75],"environments.":[22],"Although":[23],"recent":[24],"vision-language":[25],"models":[26],"(VLMs)":[27],"have":[28],"shown":[29],"promising":[30],"performance":[31],"on":[32,123],"observation-conditioned":[33],"perception":[35],"tasks,":[38,127],"it":[39],"remains":[40],"unclear":[41],"whether":[42],"they":[43],"build":[45],"coherent":[46,132],"understanding,":[48],"act":[49],"upon":[50],"it,":[51],"refine":[53],"their":[54],"through":[56],"multi-turn":[57,140],"feedback.":[58],"To":[59],"study":[60],"this":[61],"problem,":[62],"we":[63,86],"introduce":[64],"\\textbf{SpatialAct},":[65],"a":[66,114],"simulator-grounded":[67],"benchmark":[68],"for":[69],"probing":[70],"\\textit{action-conditioned":[71],"reasoning}":[73],"scenes.":[76],"Starting":[77],"from":[78],"the":[79,106],"most":[80],"challenging":[81],"setting,":[82],"Multi-turn":[83],"Interactive":[84],"Refinement,":[85],"further":[87],"design":[88],"its":[89],"decomposed":[90],"counterpart,":[91],"Single-step":[92],"Error":[93],"Detection":[94],"Fix,":[96],"together":[97],"with":[98],"five":[99],"fundamental":[100],"ability":[102],"tasks":[103],"to":[104,130],"diagnose":[105],"underlying":[107],"causes":[108],"of":[109],"model":[110],"failures.":[111],"Experiments":[112],"reveal":[113],"clear":[115],"reasoning-to-action":[116],"gap:":[117],"current":[118,149],"VLMs":[119],"perform":[121],"well":[122],"isolated":[124],"but":[128],"struggle":[129],"maintain":[131],"beliefs":[134],"produce":[136],"reliable":[137],"during":[139],"feedback,":[141],"substantially":[142],"underperforming":[143],"humans.":[144],"These":[145],"results":[146],"suggest":[147],"that":[148],"VLM":[150],"agents":[151],"still":[152],"lack":[153],"robust":[154],"state":[156],"tracking":[157],"under":[158],"action-induced":[159],"environment":[160],"changes,":[161],"even":[162],"when":[163],"low-level":[164],"control":[165],"is":[166],"abstracted":[167],"away.":[168]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-06-02T00:00:00"}
