{"id":"https://openalex.org/W7161003922","doi":"https://doi.org/10.48550/arxiv.2605.11809","title":"Beyond World-Frame Action Heads: Motion-Centric Action Frames for Vision-Language-Action Models","display_name":"Beyond World-Frame Action Heads: Motion-Centric Action Frames for Vision-Language-Action Models","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161003922","doi":"https://doi.org/10.48550/arxiv.2605.11809"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11809","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114178241","display_name":"Huoren Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Huoren","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101196282","display_name":"Jianchao Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Jianchao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136044659","display_name":"Hu Yusong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yusong, Hu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136023597","display_name":"Qiguan Ou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ou, Qiguan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136012577","display_name":"Yuyang Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Yuyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136060526","display_name":"Wei Ke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ke, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136007869","display_name":"Yuhang He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136056059","display_name":"SongLin Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, SongLin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136077443","display_name":"Zhiheng Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zhiheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136064736","display_name":"Yihong Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Yihong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6410999894142151,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6410999894142151,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.20229999721050262,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.02410000003874302,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.7861999869346619},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.703000009059906},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6851999759674072},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6133000254631042},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5626000165939331},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.5242000222206116},{"id":"https://openalex.org/keywords/rotation","display_name":"Rotation (mathematics)","score":0.504800021648407}],"concepts":[{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.7861999869346619},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.703000009059906},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6851999759674072},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6304000020027161},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6133000254631042},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5626000165939331},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.5242000222206116},{"id":"https://openalex.org/C74050887","wikidata":"https://www.wikidata.org/wiki/Q848368","display_name":"Rotation (mathematics)","level":2,"score":0.504800021648407},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4805999994277954},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.41589999198913574},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3912000060081482},{"id":"https://openalex.org/C74992021","wikidata":"https://www.wikidata.org/wiki/Q184876","display_name":"Frame of reference","level":2,"score":0.3817000091075897},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.358599990606308},{"id":"https://openalex.org/C104065381","wikidata":"https://www.wikidata.org/wiki/Q1002535","display_name":"Geometric modeling","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2928999960422516},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.28999999165534973},{"id":"https://openalex.org/C172849965","wikidata":"https://www.wikidata.org/wiki/Q3148875","display_name":"Reference frame","level":3,"score":0.2847999930381775},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.25450000166893005},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2],"have":[3],"advanced":[4],"rapidly":[5],"with":[6,44,121,135],"stronger":[7],"backbones,":[8],"broader":[9],"pre-training,":[10],"and":[11,50,78,142,167,181],"larger":[12],"demonstration":[13],"datasets,":[14],"yet":[15],"their":[16],"action":[17,25,38,53,172],"heads":[18],"remain":[19],"largely":[20],"homogeneous:":[21],"most":[22],"directly":[23],"predict":[24],"commands":[26],"in":[27,68,127,192],"a":[28,36,45,51,61,74,112],"fixed":[29],"world":[30,84],"coordinate":[31],"frame.":[32],"We":[33],"propose":[34],"\\textbf{MCF-Proto},":[35],"lightweight":[37,165],"head":[39,173],"that":[40,163],"equips":[41],"VLA":[42,178],"policies":[43,179],"Motion-Centric":[46],"Action":[47],"Frame":[48],"(MCF)":[49],"prototype-based":[52],"parameterization.":[54],"At":[55],"each":[56],"step,":[57],"the":[58,69,83,107,128,171,193],"policy":[59],"predicts":[60],"rotation":[62],"$R_t":[63],"\\in":[64],"SO(3)$,":[65],"composes":[66],"actions":[67,126],"transformed":[70],"local":[71,109],"frame":[72,85],"from":[73],"set":[75],"of":[76],"prototypes,":[77],"maps":[79],"them":[80],"back":[81],"to":[82,170],"for":[86],"end-to-end":[87],"training,":[88],"using":[89],"only":[90],"standard":[91],"demonstrations":[92],"without":[93],"auxiliary":[94],"supervision.":[95],"This":[96],"simple":[97],"design":[98],"induces":[99],"stable":[100,113],"emergent":[101],"structure.":[102],"Without":[103],"explicit":[104],"directional":[105],"labels,":[106],"learned":[108,129],"frames":[110],"develop":[111],"geometric":[114,158,166],"structure":[115,169],"whose":[116],"axes":[117],"are":[118],"strongly":[119],"compatible":[120],"demonstrated":[122],"end-effector":[123],"motion.":[124],"Meanwhile,":[125],"representation":[130],"become":[131],"substantially":[132],"more":[133,143],"compact,":[134],"variation":[136],"captured":[137],"by":[138,146],"fewer":[139],"dominant":[140],"directions":[141],"regularly":[144],"organized":[145],"shared":[147],"prototypes.":[148],"These":[149],"structural":[150],"properties":[151],"translate":[152],"into":[153],"improved":[154],"robustness,":[155],"especially":[156],"under":[157],"perturbations.":[159],"Our":[160],"results":[161],"suggest":[162],"adding":[164],"compositional":[168],"can":[174],"materially":[175],"improve":[176],"how":[177],"organize":[180],"generalize":[182],"robotic":[183],"manipulation":[184],"behavior.":[185],"An":[186],"anonymized":[187],"code":[188],"repository":[189],"is":[190],"provided":[191],"supplementary":[194],"material.":[195]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
