{"id":"https://openalex.org/W7136636792","doi":"https://doi.org/10.48550/arxiv.2603.12939","title":"RoboStream: Weaving Spatio-Temporal Reasoning with Memory in Vision-Language Models for Robotics","display_name":"RoboStream: Weaving Spatio-Temporal Reasoning with Memory in Vision-Language Models for Robotics","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7136636792","doi":"https://doi.org/10.48550/arxiv.2603.12939"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12939","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12939","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12939","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5051294371","display_name":"Yuzhi Huang","orcid":"https://orcid.org/0009-0008-3903-7667"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Huang, Yuzhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129539803","display_name":"Jie Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129440533","display_name":"Weijue Bu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bu, Weijue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129559290","display_name":"Ziyi Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Ziyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065163866","display_name":"Gaoyang Jiang","orcid":"https://orcid.org/0000-0002-8478-0619"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Gaoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129443640","display_name":"Ye Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123514399","display_name":"Kangye Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Kangye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062107989","display_name":"Shuzhao Xie","orcid":"https://orcid.org/0009-0008-3017-1077"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Shuzhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129504640","display_name":"Yue Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039066582","display_name":"Chenglei Wu","orcid":"https://orcid.org/0000-0002-7307-9480"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Chenglei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129626829","display_name":"Jingyan Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Jingyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129608014","display_name":"Zhi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5051294371"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6373000144958496,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6373000144958496,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.10509999841451645,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.05530000105500221,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.6829000115394592},{"id":"https://openalex.org/keywords/causal-reasoning","display_name":"Causal reasoning","score":0.5726000070571899},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.5015000104904175},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.47279998660087585},{"id":"https://openalex.org/keywords/robotics","display_name":"Robotics","score":0.459199994802475},{"id":"https://openalex.org/keywords/commonsense-reasoning","display_name":"Commonsense reasoning","score":0.436599999666214},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4228000044822693},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.42089998722076416},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.3797000050544739},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.3425999879837036},{"id":"https://openalex.org/keywords/trace","display_name":"TRACE (psycholinguistics)","score":0.3393000066280365}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6898999810218811},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.6829000115394592},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6717000007629395},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.5726000070571899},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.5015000104904175},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.47279998660087585},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.459199994802475},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.436599999666214},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4228000044822693},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.42089998722076416},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3797000050544739},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3785000145435333},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3458000123500824},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34279999136924744},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.3393000066280365},{"id":"https://openalex.org/C54525549","wikidata":"https://www.wikidata.org/wiki/Q2553445","display_name":"Weaving","level":2,"score":0.3359000086784363},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.3212999999523163},{"id":"https://openalex.org/C192327766","wikidata":"https://www.wikidata.org/wiki/Q1038799","display_name":"Cognitive robotics","level":3,"score":0.31299999356269836},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.31139999628067017},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C163504300","wikidata":"https://www.wikidata.org/wiki/Q2364925","display_name":"Causal structure","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C22304111","wikidata":"https://www.wikidata.org/wiki/Q1417978","display_name":"Object permanence","level":4,"score":0.30169999599456787},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29750001430511475},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.2937000095844269},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C79897977","wikidata":"https://www.wikidata.org/wiki/Q5054568","display_name":"Causal chain","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.28529998660087585},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C2982912361","wikidata":"https://www.wikidata.org/wiki/Q1851867","display_name":"Mental model","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C96199812","wikidata":"https://www.wikidata.org/wiki/Q2145290","display_name":"Mental representation","level":3,"score":0.26739999651908875},{"id":"https://openalex.org/C88576662","wikidata":"https://www.wikidata.org/wiki/Q18646","display_name":"Episodic memory","level":3,"score":0.267300009727478},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.2599000036716461}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12939","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12939","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12939","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12939","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7065406441688538}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Enabling":[0],"reliable":[1,232],"long-horizon":[2,204,233],"robotic":[3],"manipulation":[4],"is":[5],"a":[6,103,139,168],"crucial":[7],"step":[8,18],"toward":[9],"open-world":[10],"embodied":[11],"intelligence.":[12],"However,":[13],"VLM-based":[14],"planners":[15],"treat":[16],"each":[17,122],"as":[19],"an":[20],"isolated":[21],"observation-to-action":[22],"mapping,":[23],"forcing":[24],"them":[25,120],"to":[26,91,155,185],"reinfer":[27],"scene":[28],"geometry":[29],"from":[30],"raw":[31],"pixels":[32],"at":[33,121],"every":[34],"decision":[35],"point":[36],"while":[37],"remaining":[38],"unaware":[39],"of":[40,65],"how":[41],"prior":[42],"actions":[43],"have":[44],"reshaped":[45],"the":[46,55,77,183],"environment.":[47],"Despite":[48],"strong":[49],"short-horizon":[50],"performance,":[51],"these":[52,87],"systems":[53],"lack":[54],"spatio-temporal":[56,131,222],"reasoning":[57,132,223],"required":[58],"for":[59,129,159,231],"persistent":[60,70,104,134,160],"geometric":[61,144,157],"anchoring":[62,145],"and":[63,86,112,163,189,206,216,224],"memory":[64,226],"action-triggered":[66,175],"state":[67,71,176],"transitions.":[68],"Without":[69],"tracking,":[72],"perceptual":[73],"errors":[74],"accumulate":[75],"across":[76,115,178],"execution":[78],"horizon,":[79],"temporarily":[80],"occluded":[81],"objects":[82],"are":[83,227],"catastrophically":[84],"forgotten,":[85],"compounding":[88],"failures":[89],"lead":[90],"precondition":[92],"violations":[93],"that":[94,107,142,173,221],"cascade":[95],"through":[96,146],"subsequent":[97],"steps.":[98,179],"In":[99],"contrast,":[100],"humans":[101],"maintain":[102],"mental":[105],"model":[106],"continuously":[108],"tracks":[109],"spatial":[110],"relations":[111],"action":[113],"consequences":[114],"interactions":[116],"rather":[117],"than":[118],"reconstructing":[119],"instant.":[123],"Inspired":[124],"by":[125],"this":[126],"human":[127],"capacity":[128],"causal":[130,165,187,225],"with":[133],"memory,":[135],"we":[136],"propose":[137],"RoboStream,":[138],"training-free":[140],"framework":[141],"achieves":[143,201],"Spatio-Temporal":[147,170],"Fusion":[148],"Tokens":[149],"(STF-Tokens),":[150],"which":[151],"bind":[152],"visual":[153],"evidence":[154],"3D":[156],"attributes":[158],"object":[161,191],"grounding,":[162],"maintains":[164],"continuity":[166],"via":[167],"Causal":[169],"Graph":[171],"(CSTG)":[172],"records":[174],"transitions":[177],"This":[180],"design":[181],"enables":[182],"planner":[184],"trace":[186],"chains":[188],"preserve":[190],"permanence":[192],"under":[193],"occlusion":[194],"without":[195],"additional":[196],"training":[197],"or":[198],"fine-tuning.":[199],"RoboStream":[200],"90.5%":[202],"on":[203,208],"RLBench":[205],"44.4%":[207],"challenging":[209],"real-world":[210],"block-building":[211],"tasks,":[212],"where":[213],"both":[214],"SoFar":[215],"VoxPoser":[217],"score":[218],"11.1%,":[219],"demonstrating":[220],"critical":[228],"missing":[229],"components":[230],"manipulation.":[234]},"counts_by_year":[],"updated_date":"2026-03-17T07:05:13.627479","created_date":"2026-03-17T00:00:00"}
