{"id":"https://openalex.org/W7162762456","doi":"https://doi.org/10.48550/arxiv.2605.30282","title":"Gaze2Act: Gaze-Conditioned Vision-Language-Action Policies for Interactive Robot Manipulation","display_name":"Gaze2Act: Gaze-Conditioned Vision-Language-Action Policies for Interactive Robot Manipulation","publication_year":2026,"publication_date":"2026-05-28","ids":{"openalex":"https://openalex.org/W7162762456","doi":"https://doi.org/10.48550/arxiv.2605.30282"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.30282","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30282","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.30282","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027489502","display_name":"Kuangji Zuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuo, Kuangji","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137340516","display_name":"Gen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Gen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101307843","display_name":"Bofan Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Bofan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129944976","display_name":"Yanshuo Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yanshuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137345710","display_name":"Boyu Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Boyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137341024","display_name":"Shijia Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Shijia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137362628","display_name":"Xinyu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Xinyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100986933","display_name":"Xichen Yuan","orcid":"https://orcid.org/0000-0002-9111-063X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Xichen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137372659","display_name":"Chuhao Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Chuhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137364804","display_name":"Jiaqi Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Jiaqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137360290","display_name":"Geng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Geng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137332650","display_name":"Jianfei Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jianfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.2985000014305115,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.2985000014305115,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.25600001215934753,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11707","display_name":"Gaze Tracking and Assistive Technology","score":0.14489999413490295,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gaze","display_name":"Gaze","score":0.7904000282287598},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6711000204086304},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.6237000226974487},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6079000234603882},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.6033999919891357},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.5055000185966492},{"id":"https://openalex.org/keywords/human\u2013robot-interaction","display_name":"Human\u2013robot interaction","score":0.48089998960494995}],"concepts":[{"id":"https://openalex.org/C2779916870","wikidata":"https://www.wikidata.org/wiki/Q14467155","display_name":"Gaze","level":2,"score":0.7904000282287598},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7728000283241272},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6711000204086304},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.670799970626831},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.6237000226974487},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6119999885559082},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6079000234603882},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.6033999919891357},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.5055000185966492},{"id":"https://openalex.org/C145460709","wikidata":"https://www.wikidata.org/wiki/Q859951","display_name":"Human\u2013robot interaction","level":3,"score":0.48089998960494995},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4618000090122223},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4302000105381012},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.35109999775886536},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.32100000977516174},{"id":"https://openalex.org/C162947575","wikidata":"https://www.wikidata.org/wiki/Q2005645","display_name":"Social robot","level":5,"score":0.2865999937057495},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.27970001101493835}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.30282","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30282","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.30282","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30282","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2],"have":[3],"recently":[4],"shown":[5],"strong":[6],"potential":[7],"for":[8,78,110,199],"robot":[9,130],"learning":[10],"by":[11,89],"following":[12],"language":[13,18],"instructions.":[14],"However,":[15],"in":[16,164,176],"practice,":[17],"alone":[19],"is":[20,29],"often":[21],"insufficient":[22],"to":[23,31,36,43,131,133],"precisely":[24],"convey":[25],"human":[26,69,189],"intent.":[27,142],"It":[28,172],"difficult":[30],"describe":[32],"which":[33],"exact":[34],"object":[35,104,177],"interact":[37],"with":[38],"among":[39],"similar":[40],"candidates,":[41],"where":[42],"act":[44],"on":[45,155],"the":[46,50,85,94,120,129],"object,":[47],"or":[48],"how":[49],"target":[51,112],"may":[52],"change":[53],"during":[54],"execution.":[55],"To":[56],"address":[57],"this":[58],"limitation,":[59],"we":[60],"propose":[61],"Gaze2Act,":[62],"a":[63,72,107,144,156,192],"novel":[64],"VLA":[65,201],"framework":[66],"that":[67,188],"leverages":[68],"gaze":[70,92,108,190],"as":[71],"dynamic":[73,141,182],"and":[74,106,125,136,151,168,181,195],"intuitive":[75],"intent":[76,166,183],"signal":[77],"complex":[79],"interactive":[80],"manipulation.":[81],"Gaze2Act":[82,160],"first":[83],"bridges":[84],"ego-exo":[86],"view":[87],"gap":[88],"mapping":[90],"first-person":[91],"into":[93,119],"robot's":[95],"perspective":[96],"through":[97,122],"cross-view":[98],"semantic":[99],"matching,":[100],"producing":[101],"both":[102,165],"an":[103],"mask":[105],"point":[109],"coarse-to-fine":[111],"specification.":[113],"These":[114,185],"cues":[115],"are":[116],"then":[117],"integrated":[118],"policy":[121],"perception-level":[123],"prompting":[124],"action-level":[126],"conditioning,":[127],"allowing":[128],"attend":[132],"relevant":[134],"regions":[135],"execute":[137],"precise":[138],"interactions":[139],"under":[140],"In":[143],"systematic":[145],"evaluation":[146],"across":[147],"seven":[148],"task":[149,169],"categories":[150],"16":[152],"real-robot":[153],"tasks":[154],"Unitree":[157],"G1":[158],"humanoid,":[159],"achieves":[161],"state-of-the-art":[162],"performance":[163],"accuracy":[167],"success":[170],"rate.":[171],"notably":[173],"outperforms":[174],"baselines":[175],"disambiguation,":[178],"fine-grained":[179],"interaction,":[180],"steering.":[184],"results":[186],"demonstrate":[187],"provides":[191],"natural,":[193],"low-burden,":[194],"highly":[196],"expressive":[197],"modality":[198],"human-in-the-loop":[200],"control.":[202]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2026-05-30T00:00:00"}
