{"id":"https://openalex.org/W4415059848","doi":"https://doi.org/10.48550/arxiv.2509.07957","title":"Graph-Fused Vision-Language-Action for Policy Reasoning in Multi-Arm Robotic Manipulation","display_name":"Graph-Fused Vision-Language-Action for Policy Reasoning in Multi-Arm Robotic Manipulation","publication_year":2025,"publication_date":"2025-09-09","ids":{"openalex":"https://openalex.org/W4415059848","doi":"https://doi.org/10.48550/arxiv.2509.07957"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.07957","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.07957","pdf_url":"https://arxiv.org/pdf/2509.07957","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.07957","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055590426","display_name":"Shunlei Li","orcid":"https://orcid.org/0000-0002-2872-4217"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Shunlei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087982946","display_name":"Longsen Gao","orcid":"https://orcid.org/0009-0005-7993-7203"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Longsen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103172802","display_name":"Jiuwen Cao","orcid":"https://orcid.org/0000-0002-6088-3912"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Jiuwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101760892","display_name":"Yingbai Hu","orcid":"https://orcid.org/0000-0003-2452-3570"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yingbai","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5055590426"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.7615000009536743,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.7615000009536743,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.5920000076293945},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5232999920845032},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5160999894142151},{"id":"https://openalex.org/keywords/planner","display_name":"Planner","score":0.45820000767707825},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4205999970436096},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.40610000491142273},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.3806000053882599},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3790000081062317}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7477999925613403},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6327000260353088},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.5920000076293945},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5232999920845032},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5160999894142151},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.45820000767707825},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4205999970436096},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3806000053882599},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3790000081062317},{"id":"https://openalex.org/C119701452","wikidata":"https://www.wikidata.org/wiki/Q5165881","display_name":"Control reconfiguration","level":2,"score":0.3765000104904175},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3702999949455261},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.3490000069141388},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34850001335144043},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.3321000039577484},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3312999904155731},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3278999924659729},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3190999925136566},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C2985527887","wikidata":"https://www.wikidata.org/wiki/Q1587588","display_name":"Robot manipulator","level":3,"score":0.2757999897003174},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.26460000872612},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26460000872612},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26420000195503235},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2615000009536743}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.07957","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.07957","pdf_url":"https://arxiv.org/pdf/2509.07957","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2509.07957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.07957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.07957","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.07957","pdf_url":"https://arxiv.org/pdf/2509.07957","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Acquiring":[0],"dexterous":[1],"robotic":[2,49],"skills":[3],"from":[4,58],"human":[5,60],"video":[6],"demonstrations":[7],"remains":[8],"a":[9,43,92,113,172],"significant":[10],"challenge,":[11],"largely":[12],"due":[13],"to":[14,24,51,67,95,163],"conventional":[15],"reliance":[16],"on":[17,130,171],"low-level":[18],"trajectory":[19],"replication,":[20],"which":[21,87],"often":[22],"fails":[23],"generalize":[25],"across":[26,189],"varying":[27],"objects,":[28],"spatial":[29,141,203],"layouts,":[30],"and":[31,55,75,100,140,155,184,192,199,204],"manipulator":[32],"configurations.":[33],"To":[34,105],"address":[35],"this":[36],"limitation,":[37],"we":[38,111],"introduce":[39],"Graph-Fused":[40],"Vision-Language-Action":[41],"(GF-VLA),":[42],"unified":[44],"framework":[45],"that":[46,117,146],"enables":[47],"dual-arm":[48,132,173],"systems":[50],"perform":[52],"task-level":[53],"reasoning":[54],"execution":[56],"directly":[57],"RGB-D":[59],"demonstrations.":[61],"GF-VLA":[62,129],"employs":[63],"an":[64],"information-theoretic":[65],"approach":[66],"extract":[68],"task-relevant":[69],"cues,":[70],"selectively":[71],"highlighting":[72],"critical":[73],"hand-object":[74],"object-object":[76],"interactions.":[77],"These":[78],"cues":[79],"are":[80,88],"structured":[81],"into":[82],"temporally":[83],"ordered":[84],"scene":[85],"graphs,":[86],"subsequently":[89],"integrated":[90],"with":[91],"language-conditioned":[93],"transformer":[94],"produce":[96],"hierarchical":[97],"behavior":[98],"trees":[99],"interpretable":[101,166],"Cartesian":[102],"motion":[103],"primitives.":[104],"enhance":[106],"efficiency":[107],"in":[108],"bimanual":[109],"execution,":[110],"propose":[112],"cross-arm":[114],"allocation":[115],"strategy":[116],"autonomously":[118],"determines":[119],"gripper":[120],"assignment":[121],"without":[122],"requiring":[123],"explicit":[124],"geometric":[125,193],"modeling.":[126],"We":[127],"validate":[128],"four":[131],"block":[133],"assembly":[134],"benchmarks":[135],"involving":[136],"symbolic":[137],"structure":[138],"construction":[139],"generalization.":[142],"Empirical":[143],"results":[144],"demonstrate":[145],"the":[147,160],"proposed":[148],"representation":[149],"achieves":[150],"over":[151],"95%":[152],"graph":[153],"accuracy":[154],"93%":[156],"subtask":[157],"segmentation,":[158],"enabling":[159],"language-action":[161],"planner":[162],"generate":[164],"robust,":[165],"task":[167,187],"policies.":[168],"When":[169],"deployed":[170],"robot,":[174],"these":[175],"policies":[176],"attain":[177],"94%":[178],"grasp":[179],"reliability,":[180],"89%":[181],"placement":[182],"accuracy,":[183],"90%":[185],"overall":[186],"success":[188],"stacking,":[190],"letter-formation,":[191],"reconfiguration":[194],"tasks,":[195],"evidencing":[196],"strong":[197],"generalization":[198],"robustness":[200],"under":[201],"diverse":[202],"semantic":[205],"variations.":[206]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-11T00:00:00"}
