{"id":"https://openalex.org/W7158872090","doi":"https://doi.org/10.48550/arxiv.2604.26934","title":"World2VLM: Distilling World Model Imagination into VLMs for Dynamic Spatial Reasoning","display_name":"World2VLM: Distilling World Model Imagination into VLMs for Dynamic Spatial Reasoning","publication_year":2026,"publication_date":"2026-04-29","ids":{"openalex":"https://openalex.org/W7158872090","doi":"https://doi.org/10.48550/arxiv.2604.26934"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.26934","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26934","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.26934","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134880813","display_name":"Wanyue Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Wanyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134882335","display_name":"Wenxiang Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Wenxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134910286","display_name":"Wang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Wang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013208204","display_name":"Jiaxin Luo","orcid":"https://orcid.org/0000-0001-9821-8371"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Jiaxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120749136","display_name":"Helu Zhi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhi, Helu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134911285","display_name":"Yibin Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134905222","display_name":"Shuo Ren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Shuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014957271","display_name":"Zitao Liu","orcid":"https://orcid.org/0000-0003-0491-307X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zitao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134904290","display_name":"Jiajun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiajun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5134880813"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8155999779701233,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8155999779701233,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.01899999938905239,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.018799999728798866,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6477000117301941},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6208999752998352},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5684000253677368},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5453000068664551},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4426000118255615},{"id":"https://openalex.org/keywords/parameterized-complexity","display_name":"Parameterized complexity","score":0.4357999861240387},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3959999978542328},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.3765000104904175}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7017999887466431},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6477000117301941},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6208999752998352},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5684000253677368},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5526000261306763},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5453000068664551},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4426000118255615},{"id":"https://openalex.org/C165464430","wikidata":"https://www.wikidata.org/wiki/Q1570441","display_name":"Parameterized complexity","level":2,"score":0.4357999861240387},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3959999978542328},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3765000104904175},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3749000132083893},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.33399999141693115},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.32899999618530273},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3138999938964844},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.2669999897480011},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.25920000672340393},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2583000063896179},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.25440001487731934}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.26934","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26934","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.26934","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26934","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-language":[0],"models":[1,47,186],"(VLMs)":[2],"have":[3],"shown":[4],"strong":[5],"performance":[6],"on":[7,133,144],"static":[8],"visual":[9],"understanding,":[10],"yet":[11],"they":[12],"still":[13],"struggle":[14],"with":[15,38,45,129],"dynamic":[16],"spatial":[17,36,80,123,146,204],"reasoning":[18,147],"that":[19,78,184],"requires":[20],"imagining":[21],"how":[22],"scenes":[23],"evolve":[24],"under":[25],"egocentric":[26],"motion.":[27],"Recent":[28],"efforts":[29],"address":[30],"this":[31,70,139],"limitation":[32],"either":[33],"by":[34,42,138],"scaling":[35],"supervision":[37,115],"synthetic":[39],"data":[40],"or":[41],"coupling":[43],"VLMs":[44,201],"world":[46,85,104,185],"at":[48],"inference":[49],"time.":[50],"However,":[51],"the":[52,63,127,154,169,175],"former":[53],"often":[54],"lacks":[55],"explicit":[56],"modeling":[57],"of":[58],"motion-conditioned":[59],"state":[60],"transitions,":[61],"while":[62,173],"latter":[64],"incurs":[65],"substantial":[66],"computational":[67],"overhead.":[68],"In":[69],"work,":[71],"we":[72,100],"propose":[73],"World2VLM,":[74],"a":[75,83,88,96,102,130,134,207],"training":[76],"framework":[77],"distills":[79],"imagination":[81,205],"from":[82],"generative":[84],"model":[86,105,156],"into":[87],"vision-language":[89],"model.":[90],"Given":[91],"an":[92],"initial":[93],"observation":[94],"and":[95,112,120,141,164,209],"parameterized":[97],"camera":[98],"trajectory,":[99],"use":[101],"view-consistent":[103],"to":[106,202],"synthesize":[107],"geometrically":[108],"aligned":[109],"future":[110],"views":[111],"derive":[113],"structured":[114],"for":[116,177],"both":[117],"forward":[118],"(action-to-outcome)":[119],"inverse":[121],"(outcome-to-action)":[122],"reasoning.":[124],"We":[125],"post-train":[126],"VLM":[128],"two-stage":[131],"recipe":[132],"compact":[135],"dataset":[136],"generated":[137],"pipeline":[140],"evaluate":[142],"it":[143],"multiple":[145],"benchmarks.":[148],"World2VLM":[149],"delivers":[150],"consistent":[151],"improvements":[152],"over":[153],"base":[155],"across":[157],"diverse":[158],"benchmarks,":[159],"including":[160],"SAT-Real,":[161],"SAT-Synthesized,":[162],"VSI-Bench,":[163],"MindCube.":[165],"It":[166],"also":[167,195],"outperforms":[168],"test-time":[170],"world-model-coupled":[171],"methods":[172],"eliminating":[174],"need":[176],"expensive":[178],"inference-time":[179,192],"generation.":[180],"Our":[181],"results":[182],"suggest":[183],"can":[187],"serve":[188],"not":[189],"only":[190],"as":[191,196],"tools,":[193],"but":[194],"effective":[197],"training-time":[198],"teachers,":[199],"enabling":[200],"internalize":[203],"in":[206],"scalable":[208],"efficient":[210],"manner.":[211]},"counts_by_year":[],"updated_date":"2026-05-01T06:10:29.291645","created_date":"2026-05-01T00:00:00"}
