{"id":"https://openalex.org/W7160667478","doi":"https://doi.org/10.48550/arxiv.2605.05714","title":"TriRelVLA: Triadic Relational Structure for Generalizable Embodied Manipulation","display_name":"TriRelVLA: Triadic Relational Structure for Generalizable Embodied Manipulation","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160667478","doi":"https://doi.org/10.48550/arxiv.2605.05714"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.05714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.05714","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135648502","display_name":"Hanyu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Hanyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048294956","display_name":"Chuanhao Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Chuanhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135685248","display_name":"Gim Hee Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Gim Hee","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8680999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8680999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.028300000354647636,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.023499999195337296,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.6100999712944031},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5756000280380249},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.45559999346733093},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.44690001010894775},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.44359999895095825},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4219000041484833},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.41269999742507935},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4023999869823456},{"id":"https://openalex.org/keywords/transferability","display_name":"Transferability","score":0.39239999651908875},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.3833000063896179}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6909999847412109},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6205999851226807},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.6100999712944031},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5756000280380249},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.45559999346733093},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.44690001010894775},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.44359999895095825},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4219000041484833},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.41269999742507935},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4023999869823456},{"id":"https://openalex.org/C61272859","wikidata":"https://www.wikidata.org/wiki/Q7834031","display_name":"Transferability","level":3,"score":0.39239999651908875},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3833000063896179},{"id":"https://openalex.org/C177877439","wikidata":"https://www.wikidata.org/wiki/Q7604413","display_name":"Statistical relational learning","level":3,"score":0.3668000102043152},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3589000105857849},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3353999853134155},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.326200008392334},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31869998574256897},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.310699999332428},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.30709999799728394},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3005000054836273},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C40207289","wikidata":"https://www.wikidata.org/wiki/Q755662","display_name":"Relational model","level":3,"score":0.2791999876499176},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.2669000029563904},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C52268767","wikidata":"https://www.wikidata.org/wiki/Q1248245","display_name":"Relational theory","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2533000111579895},{"id":"https://openalex.org/C103683099","wikidata":"https://www.wikidata.org/wiki/Q5370102","display_name":"Embodied agent","level":3,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.05714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.05714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-language-action":[0],"(VLA)":[1],"models":[2,148],"perform":[3,154],"well":[4],"on":[5,81,98,181,206],"training-seen":[6],"robotic":[7,198],"tasks":[8,208],"but":[9],"struggle":[10],"to":[11,13,39,72],"generalize":[12],"unseen":[14],"scenes":[15],"and":[16,32,94,143,167,184,190,209,215],"objects.":[17],"A":[18],"key":[19],"limitation":[20],"lies":[21],"in":[22,212],"their":[23],"implicit":[24],"visual":[25,40,52],"representations,":[26],"which":[27,86],"entangle":[28],"object":[29,95],"appearance,":[30],"background,":[31],"scene":[33,59],"layout.":[34],"This":[35,175],"makes":[36],"policies":[37],"sensitive":[38],"variations.":[41],"Prior":[42],"work":[43],"improves":[44],"transferability":[45],"through":[46],"structured":[47],"intermediate":[48],"representations":[49,56,125],"that":[50,77],"objectify":[51],"content.":[53],"However,":[54],"these":[55],"mainly":[57],"capture":[58],"semantics":[60],"instead":[61],"of":[62,116],"action-relevant":[63],"relations.":[64],"As":[65],"a":[66,104,135,144,164,196],"result,":[67],"action":[68,156,173],"prediction":[69],"remains":[70],"tied":[71],"appearance":[73,182],"statistics.":[74],"We":[75,120,133,153,193],"observe":[76],"manipulation":[78],"actions":[79],"depend":[80],"the":[82,170],"object-hand-task":[83,123],"relational":[84,106,130,137,159,177],"structure,":[85],"governs":[87],"interactions":[88,149],"among":[89,150],"task":[90,191],"requirements,":[91],"robot":[92],"states,":[93],"properties.":[96],"Based":[97],"this":[99],"observation,":[100],"we":[101],"propose":[102],"TriRelVLA,":[103],"triadic":[105,124,176],"VLA":[107],"framework":[108],"for":[109,172,200],"generalizable":[110],"embodied":[111],"manipulation.":[112],"Our":[113],"approach":[114],"consists":[115],"three":[117],"components:":[118],"1)":[119],"construct":[121],"explicit":[122],"from":[126],"multimodal":[127],"inputs":[128],"as":[129],"primitives.":[131],"2)":[132],"build":[134],"task-grounded":[136],"graph.":[138],"Task-guided":[139],"cross-attention":[140],"forms":[141],"nodes,":[142],"relation-aware":[145],"graph":[146],"transformer":[147],"them.":[151],"3)":[152],"relation-conditioned":[155],"generation.":[157],"The":[158],"structure":[160],"is":[161],"compressed":[162],"into":[163,169],"bottleneck":[165,178],"space":[166],"projected":[168],"LLM":[171],"prediction.":[174],"reduces":[179],"reliance":[180],"statistics":[183],"enables":[185],"transfer":[186],"across":[187],"scenes,":[188],"objects,":[189],"compositions.":[192],"further":[194],"introduce":[195],"real-world":[197],"dataset":[199],"fine-tuning.":[201],"Experiments":[202],"show":[203],"strong":[204],"performance":[205],"fine-tuned":[207],"clear":[210],"gains":[211],"cross-scene,":[213],"cross-object,":[214],"cross-task":[216],"generalization.":[217]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-09T00:00:00"}
