{"id":"https://openalex.org/W7141319652","doi":"https://doi.org/10.48550/arxiv.2603.25741","title":"Vega: Learning to Drive with Natural Language Instructions","display_name":"Vega: Learning to Drive with Natural Language Instructions","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7141319652","doi":"https://doi.org/10.48550/arxiv.2603.25741"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.25741","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25741","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.25741","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130809212","display_name":"Sicheng Zuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuo, Sicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130765841","display_name":"Yuxuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130769004","display_name":"Wenzhao Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Wenzhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130809955","display_name":"Zheng Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Zheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130772615","display_name":"Jie Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103740674","display_name":"Jiwen Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Jiwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9696999788284302,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9696999788284302,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.002199999988079071,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5995000004768372},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5615000128746033},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5478000044822693},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.5386000275611877},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5045999884605408},{"id":"https://openalex.org/keywords/natural-language-understanding","display_name":"Natural language understanding","score":0.4952000081539154},{"id":"https://openalex.org/keywords/natural-language-generation","display_name":"Natural language generation","score":0.40540000796318054},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.39730000495910645}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8133000135421753},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5995000004768372},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5644000172615051},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5615000128746033},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5478000044822693},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.5386000275611877},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5098000168800354},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5045999884605408},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.4952000081539154},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4129999876022339},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.40540000796318054},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.39730000495910645},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.39480000734329224},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.35989999771118164},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.34130001068115234},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3310000002384186},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.27459999918937683}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.25741","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25741","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.25741","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25741","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.703826367855072,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-language-action":[0],"models":[1],"have":[2],"reshaped":[3],"autonomous":[4],"driving":[5,47,57,150],"to":[6,31,81,94,107],"incorporate":[7],"languages":[8],"into":[9],"the":[10,19,29,60,78,91,111,143],"decision-making":[11],"process.":[12],"However,":[13],"most":[14],"existing":[15],"pipelines":[16],"only":[17,131],"utilize":[18],"language":[20,87],"modality":[21],"for":[22,36,71,118,121,145],"scene":[23],"descriptions":[24],"or":[25],"reasoning":[26],"and":[27,74,86,90,100,113,148],"lack":[28],"flexibility":[30],"follow":[32],"diverse":[33,56],"user":[34],"instructions":[35,58,88],"personalized":[37,149],"driving.":[38],"To":[39],"address":[40],"this,":[41],"we":[42],"first":[43],"construct":[44],"a":[45,66],"large-scale":[46],"dataset":[48],"(InstructScene)":[49],"containing":[50],"around":[51],"100,000":[52],"scenes":[53],"annotated":[54],"with":[55,59],"corresponding":[61],"trajectories.":[62],"We":[63,76,103],"then":[64],"propose":[65],"unified":[67],"Vision-Language-World-Action":[68],"model,":[69],"Vega,":[70],"instruction-based":[72],"generation":[73],"planning.":[75],"employ":[77],"autoregressive":[79],"paradigm":[80,93],"process":[82],"visual":[83],"inputs":[84],"(vision)":[85],"(language)":[89],"diffusion":[92],"generate":[95],"future":[96],"predictions":[97],"(world":[98],"modeling)":[99],"trajectories":[101],"(action).":[102],"perform":[104],"joint":[105],"attention":[106],"enable":[108],"interactions":[109],"between":[110],"modalities":[112,120],"use":[114],"individual":[115],"projection":[116],"layers":[117],"different":[119],"more":[122,146],"capabilities.":[123],"Extensive":[124],"experiments":[125],"demonstrate":[126],"that":[127],"our":[128],"method":[129],"not":[130],"achieves":[132],"superior":[133],"planning":[134],"performance":[135],"but":[136],"also":[137],"exhibits":[138],"strong":[139],"instruction-following":[140],"abilities,":[141],"paving":[142],"way":[144],"intelligent":[147],"systems.":[151]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-28T00:00:00"}
