{"id":"https://openalex.org/W7160921653","doi":"https://doi.org/10.48550/arxiv.2605.10177","title":"MTA-RL: Robust Urban Driving via Multi-modal Transformer-based 3D Affordances and Reinforcement Learning","display_name":"MTA-RL: Robust Urban Driving via Multi-modal Transformer-based 3D Affordances and Reinforcement Learning","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160921653","doi":"https://doi.org/10.48550/arxiv.2605.10177"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.10177","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10177","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.10177","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135927146","display_name":"Guangli Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Guangli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077957422","display_name":"Dianzhao Li","orcid":"https://orcid.org/0000-0002-6193-4101"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Dianzhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009241314","display_name":"Wenjian Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Wenjian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026451492","display_name":"Bangquan Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Bangquan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135961623","display_name":"Ostap Okhrin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Okhrin, Ostap","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11099","display_name":"Autonomous Vehicle Technology and Safety","score":0.4083999991416931,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11099","display_name":"Autonomous Vehicle Technology and Safety","score":0.4083999991416931,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2248000055551529,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.06639999896287918,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.7824000120162964},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7566999793052673},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.4837000072002411},{"id":"https://openalex.org/keywords/lidar","display_name":"Lidar","score":0.41760000586509705},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.39750000834465027},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.38019999861717224},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.3799999952316284}],"concepts":[{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.7824000120162964},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7566999793052673},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6621999740600586},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5552999973297119},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.4837000072002411},{"id":"https://openalex.org/C51399673","wikidata":"https://www.wikidata.org/wiki/Q504027","display_name":"Lidar","level":2,"score":0.41760000586509705},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.39750000834465027},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.38019999861717224},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.3799999952316284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37369999289512634},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3734999895095825},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3314000070095062},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C79487989","wikidata":"https://www.wikidata.org/wiki/Q934680","display_name":"Vehicle dynamics","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C131979681","wikidata":"https://www.wikidata.org/wiki/Q1899648","display_name":"Point cloud","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.10177","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10177","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.10177","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10177","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8362898826599121,"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Robust":[0],"urban":[1,185],"autonomous":[2,186],"driving":[3,96],"requires":[4],"reliable":[5],"3D":[6,46],"scene":[7],"understanding":[8],"and":[9,41,48,62,103,151,167,175],"stable":[10],"decision-making":[11],"under":[12],"dense":[13],"interactions.":[14],"However,":[15],"existing":[16],"end-to-end":[17],"models":[18,55],"lack":[19],"interpretability,":[20],"while":[21],"modular":[22],"pipelines":[23],"suffer":[24],"from":[25],"error":[26],"propagation":[27],"across":[28,110],"brittle":[29],"interfaces.":[30],"This":[31],"paper":[32],"proposes":[33],"MTA-RL,":[34],"the":[35,88,179],"first":[36],"framework":[37],"that":[38,56,117,163],"bridges":[39],"perception":[40],"control":[42],"through":[43],"Multi-modal":[44],"Transformer-based":[45],"Affordances":[47],"Reinforcement":[49],"Learning":[50],"(RL).":[51],"Unlike":[52],"previous":[53],"fusion":[54,166],"directly":[57],"regress":[58],"actions,":[59],"RGB":[60],"images":[61],"LiDAR":[63],"point":[64],"clouds":[65],"are":[66,170],"fused":[67],"using":[68],"a":[69,83,139],"transformer":[70],"architecture":[71],"to":[72,91,138],"predict":[73],"explicit,":[74],"geometry-aware":[75],"affordance":[76],"representations.":[77],"These":[78],"structured":[79],"representations":[80],"serve":[81],"as":[82],"compact":[84],"observation":[85],"space,":[86],"enabling":[87],"RL":[89],"policy":[90],"operate":[92],"purely":[93],"on":[94,125],"predicted":[95],"semantics,":[97],"which":[98],"significantly":[99,172],"improves":[100],"sample":[101],"efficiency":[102],"stability.":[104],"Extensive":[105],"evaluations":[106],"in":[107,133,142,148,155],"CARLA":[108],"Town01-03":[109],"varying":[111],"densities":[112],"(20-60":[113],"background":[114],"vehicles)":[115],"show":[116],"MTA-RL":[118,182],"consistently":[119],"outperforms":[120],"state-of-the-art":[121],"baselines.":[122],"Trained":[123],"solely":[124],"Town03,":[126],"our":[127,164],"method":[128],"demonstrates":[129],"superior":[130],"zero-shot":[131],"generalization":[132],"unseen":[134],"towns,":[135],"achieving":[136],"up":[137],"9.0%":[140],"increase":[141,147],"Route":[143],"Completion,":[144],"an":[145,152],"11.0%":[146],"Total":[149],"Distance,":[150],"83.7%":[153],"improvement":[154],"Distance":[156],"Per":[157],"Violation.":[158],"Furthermore,":[159],"ablation":[160],"studies":[161],"confirm":[162],"multi-modal":[165],"reward":[168],"shaping":[169],"critical,":[171],"outperforming":[173],"image-only":[174],"unshaped":[176],"variants,":[177],"demonstrating":[178],"effectiveness":[180],"of":[181],"for":[183],"robust":[184],"driving.":[187]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
