{"id":"https://openalex.org/W7137819507","doi":"https://doi.org/10.48550/arxiv.2603.14851","title":"AutoMoT: A Unified Vision-Language-Action Model with Asynchronous Mixture-of-Transformers for End-to-End Autonomous Driving","display_name":"AutoMoT: A Unified Vision-Language-Action Model with Asynchronous Mixture-of-Transformers for End-to-End Autonomous Driving","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7137819507","doi":"https://doi.org/10.48550/arxiv.2603.14851"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14851","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14851","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14851","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129677899","display_name":"Wenhui Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Wenhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104805610","display_name":"Songyan Zhang","orcid":"https://orcid.org/0009-0006-2853-8875"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Songyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129685629","display_name":"Qihang Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Qihang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100341233","display_name":"Qi Wang","orcid":"https://orcid.org/0000-0001-9099-9695"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhidong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102734535","display_name":"Zhiqi Mao","orcid":"https://orcid.org/0009-0003-9432-8655"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mao, Zhiqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129655588","display_name":"Collister Chua","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chua, Collister","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129735508","display_name":"Zhan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129713475","display_name":"Long Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Long","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129649258","display_name":"Chen Lv","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lv, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.659500002861023,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.659500002861023,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11099","display_name":"Autonomous Vehicle Technology and Safety","score":0.11089999973773956,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.05889999866485596,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7121000289916992},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.6502000093460083},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5250999927520752},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5245000123977661},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.44429999589920044},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.4300999939441681},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.42879998683929443}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8068000078201294},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7121000289916992},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.6502000093460083},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5527999997138977},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5250999927520752},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5245000123977661},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.44429999589920044},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.4300999939441681},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.42879998683929443},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.40459999442100525},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.36000001430511475},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.33709999918937683},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3109999895095825},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3027999997138977},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14851","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14851","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14851","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14851","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.8027100563049316,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Integrating":[0],"vision-language":[1],"models":[2],"(VLMs)":[3],"into":[4],"end-to-end":[5,70],"(E2E)":[6],"autonomous":[7],"driving":[8,57],"(AD)":[9],"systems":[10],"has":[11],"shown":[12],"promise":[13],"in":[14,66,147],"improving":[15],"scene":[16,165],"understanding.":[17],"However,":[18],"existing":[19],"integration":[20],"strategies":[21],"suffer":[22],"from":[23],"several":[24],"limitations:":[25],"they":[26],"either":[27],"struggle":[28],"to":[29,135,187],"resolve":[30],"distribution":[31],"misalignment":[32],"between":[33],"reasoning":[34,41,75,100],"and":[35,76,125,182,193],"action":[36,52,77],"spaces,":[37],"underexploit":[38],"the":[39,98,141,190],"general":[40,99],"capabilities":[42,101],"of":[43,102,144],"pretrained":[44],"VLMs,":[45],"or":[46],"incur":[47],"substantial":[48],"inference":[49,109],"latency":[50],"during":[51],"policy":[53],"generation,":[54],"which":[55,96],"degrades":[56],"performance.":[58],"To":[59],"address":[60],"these":[61],"challenges,":[62],"we":[63],"propose":[64],"AutoMoT":[65,130],"this":[67],"work,":[68],"an":[69],"AD":[71],"framework":[72],"that":[73,129,158],"unifies":[74],"generation":[78],"within":[79],"a":[80,88],"single":[81],"vision-language-action":[82],"(VLA)":[83],"model.":[84],"Our":[85,155],"approach":[86],"leverages":[87],"mixture-of-transformer":[89],"(MoT)":[90],"architecture":[91],"with":[92],"joint":[93],"attention":[94],"sharing,":[95],"preserves":[97],"pre-trained":[103,145,159],"VLMs":[104,146,160],"while":[105,172],"enabling":[106],"efficient":[107],"fast-slow":[108],"through":[110,168],"asynchronous":[111],"execution":[112],"at":[113],"different":[114],"task":[115],"frequencies.":[116],"Extensive":[117],"experiments":[118],"on":[119],"multiple":[120],"benchmarks,":[121],"under":[122],"both":[123],"open-":[124],"closed-loop":[126],"settings,":[127],"demonstrate":[128],"achieves":[131],"competitive":[132,163],"performance":[133,167],"compared":[134],"state-of-the-art":[136],"methods.":[137],"We":[138,185],"further":[139],"investigate":[140],"functional":[142],"boundary":[143],"AD,":[148],"examining":[149],"when":[150],"AD-tailored":[151],"fine-tuning":[152,173],"is":[153],"necessary.":[154],"results":[156],"show":[157],"can":[161],"achieve":[162],"multi-task":[164],"understanding":[166],"semantic":[169],"prompting":[170],"alone,":[171],"remains":[174],"essential":[175],"for":[176,189],"action-level":[177],"tasks":[178],"such":[179],"as":[180],"decision-making":[181],"trajectory":[183],"planning.":[184],"refer":[186],"https://automot-website.github.io/":[188],"demonstration":[191],"videos":[192],"qualitative":[194],"results.":[195]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-03-18T00:00:00"}
