{"id":"https://openalex.org/W7137158010","doi":"https://doi.org/10.48550/arxiv.2603.12639","title":"RoboStereo: Dual-Tower 4D Embodied World Models for Unified Policy Optimization","display_name":"RoboStereo: Dual-Tower 4D Embodied World Models for Unified Policy Optimization","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7137158010","doi":"https://doi.org/10.48550/arxiv.2603.12639"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12639","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129508296","display_name":"Ruicheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Ruicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129504759","display_name":"Guangyu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Guangyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113145225","display_name":"Zunnan Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zunnan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129614638","display_name":"Zihao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129517119","display_name":"Zhizhou Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Zhizhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129465280","display_name":"Mingyang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Mingyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129606171","display_name":"Jun Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129494722","display_name":"Xiu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5129508296"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.28929999470710754,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.28929999470710754,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.2849999964237213,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.09610000252723694,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.7537000179290771},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6717000007629395},{"id":"https://openalex.org/keywords/policy-learning","display_name":"Policy learning","score":0.45159998536109924},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4332999885082245},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.40700000524520874},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.35679998993873596}],"concepts":[{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.7537000179290771},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6717000007629395},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6262000203132629},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.45159998536109924},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4332999885082245},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.40700000524520874},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39719998836517334},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.32679998874664307},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.31709998846054077},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.30399999022483826},{"id":"https://openalex.org/C119693030","wikidata":"https://www.wikidata.org/wiki/Q2180497","display_name":"Economic model","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C189474733","wikidata":"https://www.wikidata.org/wiki/Q917912","display_name":"Model building","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Scalable":[0],"Embodied":[1,17],"AI":[2],"faces":[3],"fundamental":[4],"constraints":[5],"due":[6],"to":[7,55,97],"prohibitive":[8],"costs":[9],"and":[10,32,60,102,112],"safety":[11],"risks":[12],"of":[13],"real-world":[14],"interaction.":[15],"While":[16],"World":[18],"Models":[19],"(EWMs)":[20],"offer":[21],"promise":[22],"through":[23],"imagined":[24],"rollouts,":[25],"existing":[26],"approaches":[27],"suffer":[28],"from":[29,99],"geometric":[30,58],"hallucinations":[31],"lack":[33],"unified":[34,74,124],"optimization":[35],"frameworks":[36],"for":[37,76,85],"practical":[38],"policy":[39,78],"improvement.":[40],"We":[41],"introduce":[42],"RoboStereo,":[43],"a":[44],"symmetric":[45],"dual-tower":[46],"4D":[47,68],"world":[48],"model":[49],"that":[50],"employs":[51],"bidirectional":[52],"cross-modal":[53],"enhancement":[54],"ensure":[56],"spatiotemporal":[57],"consistency":[59],"alleviate":[61],"physics":[62],"hallucinations.":[63],"Building":[64],"upon":[65],"this":[66],"high-fidelity":[67],"simulator,":[69],"we":[70],"present":[71],"the":[72],"first":[73],"framework":[75,125],"world-model-based":[77],"optimization:":[79],"(1)":[80],"Test-Time":[81],"Policy":[82,90,105],"Augmentation":[83],"(TTPA)":[84],"pre-execution":[86],"verification,":[87],"(2)":[88],"Imitative-Evolutionary":[89],"Learning":[91,106],"(IEPL)":[92],"leveraging":[93],"visual":[94],"perceptual":[95],"rewards":[96],"learn":[98],"expert":[100],"demonstrations,":[101],"(3)":[103],"Open-Exploration":[104],"(OEPL)":[107],"enabling":[108],"autonomous":[109],"skill":[110],"discovery":[111],"self-correction.":[113],"Comprehensive":[114],"experiments":[115],"demonstrate":[116],"RoboStereo":[117],"achieves":[118],"state-of-the-art":[119],"generation":[120],"quality,":[121],"with":[122],"our":[123],"delivering":[126],"&gt;97%":[127],"average":[128],"relative":[129],"improvement":[130],"on":[131],"fine-grained":[132],"manipulation":[133],"tasks.":[134]},"counts_by_year":[],"updated_date":"2026-03-17T07:05:13.627479","created_date":"2026-03-17T00:00:00"}
