{"id":"https://openalex.org/W7161019990","doi":"https://doi.org/10.48550/arxiv.2605.12369","title":"GuidedVLA: Specifying Task-Relevant Factors via Plug-and-Play Action Attention Specialization","display_name":"GuidedVLA: Specifying Task-Relevant Factors via Plug-and-Play Action Attention Specialization","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161019990","doi":"https://doi.org/10.48550/arxiv.2605.12369"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.12369","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12369","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.12369","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136083996","display_name":"Xiaosong Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jia, Xiaosong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136010974","display_name":"Bowen Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Bowen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043002448","display_name":"Zuhao Ge","orcid":"https://orcid.org/0000-0002-0047-6709"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Zuhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051917669","display_name":"\u8042\u663e","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nie, Xian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136045776","display_name":"Yuchen Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136012834","display_name":"Cunxin Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Cunxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136055878","display_name":"Yufeng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yufeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136039928","display_name":"Yilin Chai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chai, Yilin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136073509","display_name":"Chao Jing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jing, Chao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136056688","display_name":"Zijian Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Zijian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136002837","display_name":"Qingwen Bu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bu, Qingwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136082918","display_name":"Haidong Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Haidong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136036492","display_name":"Chao Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Chao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136048175","display_name":"Qifeng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Qifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136023241","display_name":"Zhenjie Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhenjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128789657","display_name":"Chenhe Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chenhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136034706","display_name":"Hongyang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hongyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136060326","display_name":"Zuxuan Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zuxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136025477","display_name":"Junchi Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Junchi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136023551","display_name":"Yu-Gang Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Yu-Gang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":20,"corresponding_author_ids":["https://openalex.org/A5136083996"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6248000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6248000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.09309999644756317,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.06239999830722809,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.710099995136261},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.6276999711990356},{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.6270999908447266},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.576200008392334},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5688999891281128},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5489000082015991},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5385000109672546},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.512499988079071},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.39309999346733093}],"concepts":[{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.710099995136261},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7035999894142151},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.6276999711990356},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.6270999908447266},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6254000067710876},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.576200008392334},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5688999891281128},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5489000082015991},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5385000109672546},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.512499988079071},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4706999957561493},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.41200000047683716},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.39309999346733093},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.3537999987602234},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.33899998664855957},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.33889999985694885},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3375999927520752},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.33660000562667847},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3237000107765198},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.29840001463890076},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.27799999713897705},{"id":"https://openalex.org/C183759332","wikidata":"https://www.wikidata.org/wiki/Q343680","display_name":"Action learning","level":4,"score":0.2777000069618225},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.27000001072883606},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.25769999623298645}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.12369","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12369","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.12369","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12369","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.483043909072876,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2,41],"aim":[3],"for":[4,187],"general":[5,192],"robot":[6],"learning":[7,182],"by":[8,103],"aligning":[9],"action":[10,29,70,84],"as":[11,48,87,92],"a":[12,63,88,184],"modality":[13],"within":[14],"powerful":[15],"Vision-Language":[16],"Models":[17],"(VLMs).":[18],"Existing":[19],"VLAs":[20],"rely":[21],"on":[22,74],"end-to-end":[23],"supervision":[24],"to":[25,32,44,66,72,81,108,148],"implicitly":[26],"enable":[27],"the":[28,69,83,156],"decoding":[30],"process":[31],"learn":[33],"task-relevant":[34,75],"features.":[35,174],"However,":[36],"without":[37],"explicit":[38],"guidance,":[39],"these":[40,159],"often":[42],"overfit":[43],"spurious":[45],"correlations,":[46],"such":[47],"visual":[49],"shortcuts":[50],"or":[51],"environmental":[52],"noise,":[53],"limiting":[54],"their":[55],"generalization.":[56],"In":[57],"this":[58,118],"paper,":[59],"we":[60,116,153],"introduce":[61],"GuidedVLA,":[62],"framework":[64],"designed":[65],"manually":[67,104],"guide":[68],"generation":[71],"focus":[73],"factors.":[76,111],"Our":[77,175],"core":[78],"insight":[79],"is":[80,183],"treat":[82],"decoder":[85],"not":[86],"monolithic":[89],"learner,":[90],"but":[91],"an":[93,113],"assembly":[94],"of":[95,158],"functional":[96],"components.":[97],"Individual":[98],"attention":[99],"heads":[100],"are":[101],"supervised":[102],"defined":[105],"auxiliary":[106],"signals":[107],"capture":[109],"distinct":[110],"As":[112],"initial":[114],"study,":[115],"instantiate":[117],"paradigm":[119],"with":[120,164],"three":[121],"specialized":[122,160],"heads:":[123],"object":[124],"grounding,":[125],"spatial":[126],"geometry,":[127],"and":[128,134,144,167,191],"temporal":[129],"skill":[130],"logic.":[131],"Across":[132],"simulation":[133],"real-robot":[135],"experiments,":[136],"GuidedVLA":[137],"improves":[138],"success":[139],"rates":[140],"in":[141],"both":[142],"in-domain":[143],"out-of-domain":[145],"settings":[146],"compared":[147],"strong":[149],"VLA":[150,193],"baselines.":[151],"Finally,":[152],"show":[154],"that":[155,168,178],"quality":[157],"factors":[161],"correlates":[162],"positively":[163],"task":[165],"performance":[166],"our":[169],"mechanism":[170],"yields":[171],"decoupled,":[172],"high-quality":[173],"results":[176],"suggest":[177],"explicitly":[179],"guiding":[180],"action-decoder":[181],"promising":[185],"direction":[186],"building":[188],"more":[189],"robust":[190],"models.":[194]},"counts_by_year":[],"updated_date":"2026-05-14T06:16:12.342656","created_date":"2026-05-14T00:00:00"}
