{"id":"https://openalex.org/W7131144909","doi":"https://doi.org/10.48550/arxiv.2602.17951","title":"ROCKET: Residual-Oriented Multi-Layer Alignment for Spatially-Aware Vision-Language-Action Models","display_name":"ROCKET: Residual-Oriented Multi-Layer Alignment for Spatially-Aware Vision-Language-Action Models","publication_year":2026,"publication_date":"2026-02-20","ids":{"openalex":"https://openalex.org/W7131144909","doi":"https://doi.org/10.48550/arxiv.2602.17951"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.17951","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126633522","display_name":"Guoheng Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Guoheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102312090","display_name":"Tingting Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Tingting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066290884","display_name":"Kaixi Feng","orcid":"https://orcid.org/0000-0001-8095-8507"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Kaixi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126598911","display_name":"Chenxiang Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Chenxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123048975","display_name":"Xingguo Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Xingguo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126650564","display_name":"Zheyu Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Zheyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126588005","display_name":"Ziyao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Ziyao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049277933","display_name":"Yun He","orcid":"https://orcid.org/0000-0001-8275-0494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Yexiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126635824","display_name":"Ang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5126633522"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7073000073432922,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7073000073432922,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.12359999865293503,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.03680000081658363,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6535000205039978},{"id":"https://openalex.org/keywords/projector","display_name":"Projector","score":0.6315000057220459},{"id":"https://openalex.org/keywords/rocket","display_name":"Rocket (weapon)","score":0.6010000109672546},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.597000002861023},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5076000094413757},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.4571000039577484},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3977999985218048},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.39250001311302185}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7695000171661377},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6535000205039978},{"id":"https://openalex.org/C2776865275","wikidata":"https://www.wikidata.org/wiki/Q311666","display_name":"Projector","level":2,"score":0.6315000057220459},{"id":"https://openalex.org/C187878255","wikidata":"https://www.wikidata.org/wiki/Q2037215","display_name":"Rocket (weapon)","level":2,"score":0.6010000109672546},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.597000002861023},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5144000053405762},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5076000094413757},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4571000039577484},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.42329999804496765},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3977999985218048},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.39250001311302185},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.3776000142097473},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.3707999885082245},{"id":"https://openalex.org/C50637493","wikidata":"https://www.wikidata.org/wiki/Q1136781","display_name":"Morphing","level":2,"score":0.34540000557899475},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C173974348","wikidata":"https://www.wikidata.org/wiki/Q1469893","display_name":"Fiducial marker","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.28060001134872437},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2565000057220459},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2515000104904175}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.17951","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.17951","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17951","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.17951","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.497341126203537,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2],"enable":[3],"instruction-following":[4],"robotic":[5],"manipulation,":[6],"but":[7],"they":[8],"are":[9],"typically":[10],"pretrained":[11],"on":[12,183],"2D":[13,37],"data":[14],"and":[15,126,136,140,195,205],"lack":[16],"3D":[17,109],"spatial":[18],"understanding.":[19],"An":[20],"effective":[21],"approach":[22],"is":[23,32,134],"representation":[24,75],"alignment,":[25],"where":[26],"a":[27,36,48,72,92,107,114,131,143,163],"strong":[28],"vision":[29,110],"foundation":[30,111],"model":[31,112,206],"used":[33],"to":[34,52,87,95,152],"guide":[35],"VLA":[38,101,201],"model.":[39],"However,":[40],"existing":[41],"methods":[42],"usually":[43],"apply":[44],"supervision":[45],"at":[46,211],"only":[47,170],"single":[49],"layer,":[50],"failing":[51],"fully":[53],"exploit":[54],"the":[55,100,149,174,188],"rich":[56],"information":[57],"distributed":[58],"across":[59,193],"depth;":[60],"meanwhile,":[61],"na\u00efve":[62],"multi-layer":[63,74,80],"alignment":[64,76,81,155],"can":[65,208],"cause":[66],"gradient":[67,119],"interference.":[68],"We":[69,121,185],"introduce":[70],"ROCKET,":[71],"residual-oriented":[73],"framework":[77],"that":[78,130],"formulates":[79],"as":[82,197,199],"aligning":[83],"one":[84],"residual":[85],"stream":[86],"another.":[88],"Concretely,":[89],"ROCKET":[90,168,192],"employs":[91],"shared":[93,132,150],"projector":[94,133,151],"align":[96],"multiple":[97,104,154,200],"layers":[98,105],"of":[99,106,173,191],"backbone":[102],"with":[103,162],"powerful":[108],"via":[113],"layer-invariant":[115],"mapping,":[116],"which":[117],"reduces":[118],"conflicts.":[120],"provide":[122],"both":[123],"theoretical":[124],"justification":[125],"empirical":[127],"analyses":[128],"showing":[129],"sufficient":[135],"outperforms":[137],"prior":[138],"designs,":[139],"further":[141,186],"propose":[142],"Matryoshka-style":[144],"sparse":[145],"activation":[146],"scheme":[147],"for":[148],"balance":[153],"losses.":[156],"Our":[157],"experiments":[158],"show":[159],"that,":[160],"combined":[161],"training-free":[164],"layer":[165],"selection":[166],"strategy,":[167],"requires":[169],"about":[171],"4%":[172],"compute":[175],"budget":[176],"while":[177],"achieving":[178],"98.5%":[179],"state-of-the-art":[180],"success":[181],"rate":[182],"LIBERO.":[184],"demonstrate":[187],"superior":[189],"performance":[190],"LIBERO-Plus":[194],"RoboTwin,":[196],"well":[198],"models.":[202],"The":[203],"code":[204],"weights":[207],"be":[209],"found":[210],"https://github.com/CASE-Lab-UMD/ROCKET-VLA.":[212]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-24T00:00:00"}
