{"id":"https://openalex.org/W7162458364","doi":"https://doi.org/10.48550/arxiv.2605.25044","title":"X-DiffVLA: X-Embodied Diffusion Action Heads for Vision-Language-Action Models","display_name":"X-DiffVLA: X-Embodied Diffusion Action Heads for Vision-Language-Action Models","publication_year":2026,"publication_date":"2026-05-24","ids":{"openalex":"https://openalex.org/W7162458364","doi":"https://doi.org/10.48550/arxiv.2605.25044"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.25044","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25044","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.25044","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137047217","display_name":"Boyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Boyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137019366","display_name":"Chaoyi Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Chaoyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137005476","display_name":"Haoqi Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Haoqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137074996","display_name":"Xinrun Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xinrun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134929658","display_name":"B\u00f6rje F. Karlsson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Karlsson, B\u00f6rje F.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137083149","display_name":"Dongbin Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haoran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137073907","display_name":"Haoran Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Zongqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137057158","display_name":"Zongqing Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Dongbin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.454800009727478,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.454800009727478,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.13099999725818634,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.08049999922513962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6794000267982483},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6309000253677368},{"id":"https://openalex.org/keywords/grippers","display_name":"Grippers","score":0.590399980545044},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5076000094413757},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4821000099182129},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.45350000262260437},{"id":"https://openalex.org/keywords/transferability","display_name":"Transferability","score":0.4049000144004822},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.39010000228881836}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6998999714851379},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6794000267982483},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6309000253677368},{"id":"https://openalex.org/C2775960376","wikidata":"https://www.wikidata.org/wiki/Q1435859","display_name":"Grippers","level":2,"score":0.590399980545044},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5356000065803528},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5076000094413757},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4821000099182129},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.45350000262260437},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43470001220703125},{"id":"https://openalex.org/C61272859","wikidata":"https://www.wikidata.org/wiki/Q7834031","display_name":"Transferability","level":3,"score":0.4049000144004822},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.3785000145435333},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.362199991941452},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.33889999985694885},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3305000066757202},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.3059999942779541},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C163797641","wikidata":"https://www.wikidata.org/wiki/Q2067937","display_name":"Tree structure","level":3,"score":0.2727000117301941},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.2547000050544739}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.25044","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25044","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.25044","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25044","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.4985782504081726,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Learning":[0],"universal":[1],"policies":[2],"from":[3,158],"cross-embodied":[4,59,78,99,191],"data":[5],"remains":[6],"a":[7,71,76,106,128],"fundamental":[8],"challenge":[9],"in":[10,33,98,189],"robotics.":[11],"Although":[12],"Vision-Language-Action":[13],"(VLA)":[14],"models":[15,89],"are":[16],"pre-trained":[17],"on":[18,26,58],"large":[19],"and":[20,43,65,68,95,152,173,186],"diverse":[21,140],"datasets,":[22],"they":[23],"typically":[24],"rely":[25],"embodiment-specific":[27,116],"fine-tuning":[28],"to":[29,90,110,135,160],"achieve":[30],"strong":[31],"performance":[32],"downstream":[34],"tasks.":[35,51],"This":[36],"requirement":[37],"severely":[38],"limits":[39],"their":[40],"generalization":[41],"capability":[42],"restricts":[44],"knowledge":[45],"transfer":[46],"across":[47,139,150],"embodiments":[48,157],"performing":[49],"similar":[50],"To":[52],"overcome":[53],"these":[54],"limitations,":[55],"we":[56,102],"focus":[57],"settings":[60],"with":[61,169],"shared":[62],"robotic":[63],"bases":[64],"heterogeneous":[66,146],"end-effectors,":[67,141],"propose":[69],"X-DiffVLA,":[70],"diffusion-based":[72],"VLA":[73],"model":[74],"featuring":[75],"unified":[77],"action":[79,113],"head.":[80],"X-DiffVLA":[81,165],"can":[82],"leverage":[83],"the":[84,93,143,180,183],"generative":[85],"strengths":[86],"of":[87,145,171,182],"diffusion":[88],"capture":[91],"both":[92],"diversity":[94],"latent":[96],"correlations":[97,138],"datasets.":[100],"Specifically,":[101],"introduce":[103],"Embodiment":[104],"Forcing,":[105],"classifier-free":[107],"guidance":[108],"technique":[109],"implicitly":[111],"steer":[112],"generation":[114],"toward":[115],"functional":[117],"components,":[118],"capturing":[119],"fine-grained":[120],"structural":[121],"nuances":[122],"without":[123],"explicit":[124],"supervision.":[125],"In":[126],"addition,":[127],"Morphological":[129],"Tree":[130],"Diffusion":[131],"approach":[132],"is":[133],"designed":[134],"strengthen":[136],"behavioral":[137],"maximizing":[142],"transferability":[144],"demonstrations.":[147],"Experimental":[148],"results":[149],"RoboCasa":[151],"Isaac":[153],"Gym,":[154],"covering":[155],"different":[156],"grippers":[159],"dexterous":[161],"hands,":[162],"show":[163],"that":[164],"achieves":[166],"state-of-the-art":[167],"performance,":[168],"improvements":[170],"15.3%":[172],"12.5%,":[174],"respectively.":[175],"Real-world":[176],"evaluations":[177],"further":[178],"validate":[179],"robustness":[181],"proposed":[184],"framework":[185],"its":[187],"effectiveness":[188],"scalable":[190],"policy":[192],"learning.":[193]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-27T00:00:00"}
