{"id":"https://openalex.org/W7138445157","doi":"https://doi.org/10.48550/arxiv.2603.14948","title":"Bridging Scene Generation and Planning: Driving with World Model via Unifying Vision and Motion Representation","display_name":"Bridging Scene Generation and Planning: Driving with World Model via Unifying Vision and Motion Representation","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138445157","doi":"https://doi.org/10.48550/arxiv.2603.14948"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14948","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14948","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14948","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5007286889","display_name":"Xingtai Gui","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gui, Xingtai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019507261","display_name":"Meijie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Meijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129716468","display_name":"Tianyi Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Tianyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129650065","display_name":"Wencheng Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Wencheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129654093","display_name":"Jiahao Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Jiahao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129653202","display_name":"Feiyang Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Feiyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129684234","display_name":"Cheng-zhong Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Cheng-zhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129671295","display_name":"Jianbing Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Jianbing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5007286889"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10586","display_name":"Robotic Path Planning Algorithms","score":0.3391000032424927,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10586","display_name":"Robotic Path Planning Algorithms","score":0.3391000032424927,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.20579999685287476,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11099","display_name":"Autonomous Vehicle Technology and Safety","score":0.08959999680519104,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5146999955177307},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.491100013256073},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4449999928474426},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4154999852180481},{"id":"https://openalex.org/keywords/motion-planning","display_name":"Motion planning","score":0.397599995136261},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.38679999113082886},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.3628000020980835},{"id":"https://openalex.org/keywords/motion-estimation","display_name":"Motion estimation","score":0.3278999924659729}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7172999978065491},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6503999829292297},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5331000089645386},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5146999955177307},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.491100013256073},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4154999852180481},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.397599995136261},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.38679999113082886},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.3628000020980835},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.3278999924659729},{"id":"https://openalex.org/C146159030","wikidata":"https://www.wikidata.org/wiki/Q7625099","display_name":"Structure from motion","level":3,"score":0.3199999928474426},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.31299999356269836},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3001999855041504},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C2781085045","wikidata":"https://www.wikidata.org/wiki/Q7318308","display_name":"Reversing","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2712000012397766},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.25589999556541443}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14948","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14948","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14948","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14948","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities","score":0.6211588978767395}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"End-to-end":[0],"autonomous":[1,241],"driving":[2,33,37,145],"aims":[3],"to":[4,53,108,138,172,193],"generate":[5,167],"safe":[6],"and":[7,46,56,68,85,91,114,122,135,163,195,207,236],"plausible":[8,123],"planning":[9,87,215],"policies":[10],"from":[11,188],"raw":[12],"sensor":[13],"input.":[14],"Driving":[15,99],"world":[16,38,175,191],"models":[17,39],"have":[18],"shown":[19],"great":[20],"potential":[21],"in":[22,199],"learning":[23],"rich":[24],"representations":[25,150],"by":[26,152],"predicting":[27],"the":[28,62,69,118,133,144,174,189,204,231],"future":[29,124,185],"evolution":[30],"of":[31,64,71,120,233],"a":[32,59,78,97,105,128,139,180],"scene.":[34],"However,":[35],"existing":[36],"primarily":[40],"focus":[41],"on":[42,104,127,148,203],"visual":[43,65,112,161],"scene":[44,66,83,153],"representation,":[45,160,162],"motion":[47,73,92,115,136,159,237],"representation":[48,187,238],"is":[49],"not":[50],"explicitly":[51],"designed":[52],"be":[54],"planner-shared":[55],"inheritable,":[57],"leaving":[58],"schism":[60],"between":[61,111,158],"optimization":[63],"generation":[67,84,119,225],"requirements":[70],"precise":[72],"planning.":[74],"We":[75,94,131],"present":[76],"WorldDrive,":[77],"holistic":[79],"framework":[80],"that":[81,211],"couples":[82],"real-time":[86],"via":[88],"unifying":[89,234],"vision":[90,134,235],"representation.":[93],"first":[95],"introduce":[96],"Trajectory-aware":[98],"World":[100],"Model,":[101],"which":[102,183],"conditions":[103],"trajectory":[106],"vocabulary":[107],"enforce":[109],"consistency":[110],"dynamics":[113],"intentions,":[116],"enabling":[117],"diverse":[121],"scenes":[125],"conditioned":[126],"specific":[129],"trajectory.":[130],"transfer":[132],"encoders":[137],"downstream":[140],"Multi-modal":[141],"Planner,":[142],"ensuring":[143],"policy":[146],"operates":[147],"mature":[149],"pre-optimized":[151],"generation.":[154],"A":[155],"simple":[156],"interaction":[157],"ego":[164],"status":[165],"can":[166],"high-quality,":[168],"multi-modal":[169],"trajectories.":[170],"Furthermore,":[171],"exploit":[173],"model's":[176],"foresight,":[177],"we":[178],"propose":[179],"Future-aware":[181],"Rewarder,":[182],"distills":[184],"latent":[186],"frozen":[190],"model":[192],"evaluate":[194],"select":[196],"optimal":[197],"trajectories":[198],"real-time.":[200],"Extensive":[201],"experiments":[202],"NAVSIM,":[205],"NAVSIM-v2,":[206],"nuScenes":[208],"benchmarks":[209],"demonstrate":[210],"WorldDrive":[212],"achieves":[213],"leading":[214],"performance":[216],"among":[217],"vision-only":[218],"methods":[219],"while":[220],"maintaining":[221],"high-fidelity":[222],"action-controlled":[223],"video":[224],"capabilities,":[226],"providing":[227],"strong":[228],"evidence":[229],"for":[230,239],"effectiveness":[232],"robust":[240],"driving.":[242]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
