{"id":"https://openalex.org/W7141396159","doi":"https://doi.org/10.48550/arxiv.2603.25406","title":"MMaDA-VLA: Large Diffusion Vision-Language-Action Model with Unified Multi-Modal Instruction and Generation","display_name":"MMaDA-VLA: Large Diffusion Vision-Language-Action Model with Unified Multi-Modal Instruction and Generation","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7141396159","doi":"https://doi.org/10.48550/arxiv.2603.25406"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.25406","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25406","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.25406","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130799799","display_name":"Yang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130815285","display_name":"Pengxiang Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Pengxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122346148","display_name":"Tengyue Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Tengyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130728520","display_name":"Xudong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xudong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130758710","display_name":"Wenxuan Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Wenxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130776718","display_name":"Minghui Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Minghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130744189","display_name":"Han Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130762583","display_name":"Hongyin Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Hongyin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034760558","display_name":"Zifeng Zhuang","orcid":"https://orcid.org/0009-0008-2667-6099"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Zifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130780579","display_name":"Wei Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130761619","display_name":"Siteng Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Siteng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130744464","display_name":"Jinkui Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Jinkui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130814504","display_name":"Donglin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Donglin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5130799799"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9492999911308289,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9492999911308289,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.010599999688565731,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.006500000134110451,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5641000270843506},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.49779999256134033},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.47189998626708984},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4569000005722046},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.43950000405311584},{"id":"https://openalex.org/keywords/a-priori-and-a-posteriori","display_name":"A priori and a posteriori","score":0.4300999939441681},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.4212000072002411},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.4002000093460083},{"id":"https://openalex.org/keywords/train","display_name":"Train","score":0.3953000009059906}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7171000242233276},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5641000270843506},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.49779999256134033},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.47189998626708984},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4569000005722046},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.43950000405311584},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.4300999939441681},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.4212000072002411},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.4002000093460083},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.3953000009059906},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3912000060081482},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3603000044822693},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.35190001130104065},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.31619998812675476},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.30320000648498535},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2937000095844269},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.2904999852180481},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.28040000796318054},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.26739999651908875},{"id":"https://openalex.org/C125209513","wikidata":"https://www.wikidata.org/wiki/Q4037520","display_name":"Doors","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2524999976158142},{"id":"https://openalex.org/C55689738","wikidata":"https://www.wikidata.org/wiki/Q15963867","display_name":"Discrete time and continuous time","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.25406","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25406","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.25406","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25406","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5904673337936401,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2],"aim":[3],"to":[4,37,99],"control":[5],"robots":[6],"for":[7],"manipulation":[8],"from":[9,26],"visual":[10,127],"observations":[11],"and":[12,18,29,33,62,81,90,106,137,149],"natural-language":[13],"instructions.":[14],"However,":[15],"existing":[16],"hierarchical":[17],"autoregressive":[19],"paradigms":[20],"often":[21],"introduce":[22],"architectural":[23],"overhead,":[24],"suffer":[25],"temporal":[27],"inconsistency":[28],"long-horizon":[30,119],"error":[31],"accumulation,":[32],"lack":[34],"a":[35,50,65,72,92,102],"mechanism":[36],"capture":[38],"environment":[39],"dynamics":[40],"without":[41,129],"extra":[42],"modules.":[43],"To":[44],"this":[45],"end,":[46],"we":[47],"present":[48],"MMaDA-VLA,":[49],"fully":[51],"native":[52,73],"pre-trained":[53],"large":[54],"diffusion":[55,75],"VLA":[56],"model":[57],"that":[58,77],"unifies":[59],"multi-modal":[60],"understanding":[61],"generation":[63],"in":[64,110,124],"single":[66,93],"framework.":[67],"Our":[68],"key":[69],"idea":[70],"is":[71],"discrete":[74,87],"formulation":[76],"embeds":[78],"language,":[79],"images,":[80],"continuous":[82],"robot":[83],"controls":[84],"into":[85],"one":[86],"token":[88,97],"space":[89],"trains":[91],"backbone":[94],"with":[95],"masked":[96],"denoising":[98,113],"jointly":[100],"generate":[101],"future":[103,126],"goal":[104],"observation":[105],"an":[107],"action":[108],"chunk":[109],"parallel.":[111],"Iterative":[112],"enables":[114],"global,":[115],"order-free":[116],"refinement,":[117],"improving":[118],"consistency":[120],"while":[121],"grounding":[122],"actions":[123],"predicted":[125],"outcomes":[128],"auxiliary":[130],"world":[131],"models.":[132],"Experiments":[133],"across":[134],"simulation":[135],"benchmarks":[136],"real-world":[138],"tasks":[139],"show":[140],"state-of-the-art":[141],"performance,":[142],"achieving":[143],"98.0%":[144],"average":[145,151],"success":[146],"on":[147,153],"LIBERO":[148],"4.78":[150],"length":[152],"CALVIN.":[154]},"counts_by_year":[],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2026-03-28T00:00:00"}
