{"id":"https://openalex.org/W7140905192","doi":"https://doi.org/10.48550/arxiv.2603.24393","title":"3D-Mix for VLA: A Plug-and-Play Module for Integrating VGGT-based 3D Information into Vision-Language-Action Models","display_name":"3D-Mix for VLA: A Plug-and-Play Module for Integrating VGGT-based 3D Information into Vision-Language-Action Models","publication_year":2026,"publication_date":"2026-03-25","ids":{"openalex":"https://openalex.org/W7140905192","doi":"https://doi.org/10.48550/arxiv.2603.24393"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.24393","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.24393","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.24393","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130663148","display_name":"Bin Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu, Bin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112968072","display_name":"Shijie Lian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lian, Shijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130631901","display_name":"Xiaopeng Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Xiaopeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123471925","display_name":"Zhaolong Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Zhaolong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123480403","display_name":"Yuliang Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Yuliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102737567","display_name":"Haishan Liu","orcid":"https://orcid.org/0000-0002-0817-9928"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Haishan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069583018","display_name":"Changti Wu","orcid":"https://orcid.org/0009-0009-9448-6657"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Changti","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130717320","display_name":"Hang Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123456291","display_name":"Bailing Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bailing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130666957","display_name":"Cong Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Cong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130702550","display_name":"Kai Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kai","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5130663148"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9300000071525574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9300000071525574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.010700000450015068,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.006800000090152025,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7584999799728394},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5716999769210815},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5605999827384949},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.3944000005722046},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.36910000443458557},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.34310001134872437},{"id":"https://openalex.org/keywords/semantic-mapping","display_name":"Semantic mapping","score":0.32280001044273376},{"id":"https://openalex.org/keywords/information-integration","display_name":"Information integration","score":0.3197000026702881}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7584999799728394},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7077000141143799},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5857999920845032},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5716999769210815},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5605999827384949},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5052000284194946},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3944000005722046},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.36910000443458557},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C33326189","wikidata":"https://www.wikidata.org/wiki/Q17092450","display_name":"Information integration","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.31869998574256897},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.31700000166893005},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.30799999833106995},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2955000102519989},{"id":"https://openalex.org/C157170001","wikidata":"https://www.wikidata.org/wiki/Q4781507","display_name":"Applications of artificial intelligence","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.27810001373291016},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25929999351501465},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25429999828338623},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.24393","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.24393","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.24393","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.24393","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2,44],"leverage":[3],"Multimodal":[4],"Large":[5],"Language":[6],"Models":[7],"(MLLMs)":[8],"for":[9,34,174],"robotic":[10],"control,":[11],"but":[12],"recent":[13,38],"studies":[14],"reveal":[15],"that":[16,82,119,151],"MLLMs":[17],"exhibit":[18],"limited":[19],"spatial":[20,50,176],"intelligence":[21,177],"due":[22],"to":[23,48],"training":[24],"predominantly":[25],"on":[26,77,96,146,159],"2D":[27,89],"data,":[28],"resulting":[29],"in":[30,109,178],"inadequate":[31],"3D":[32,42,92],"perception":[33],"manipulation":[35],"tasks.":[36],"While":[37],"approaches":[39],"incorporate":[40],"specialized":[41],"vision":[43],"such":[45],"as":[46],"VGGT":[47,74],"enhance":[49],"understanding,":[51],"they":[52],"employ":[53],"diverse":[54,122],"integration":[55,75],"mechanisms":[56],"without":[57,128],"systematic":[58],"investigation,":[59],"leaving":[60],"the":[61,100,160],"optimal":[62],"fusion":[63,107],"strategy":[64],"unclear.":[65],"We":[66,113],"conduct":[67],"a":[68,116,171],"comprehensive":[69],"pilot":[70,111],"study":[71],"comparing":[72],"nine":[73,105,167],"schemes":[76,108],"standardized":[78],"benchmarks":[79],"and":[80,91,126,148],"find":[81],"semantic-conditioned":[83],"gated":[84],"fusion,":[85],"which":[86],"adaptively":[87],"balances":[88],"semantic":[90],"geometric":[93],"features":[94],"based":[95],"task":[97],"context,":[98],"achieved":[99],"strongest":[101],"performance":[102,155],"among":[103],"all":[104,166],"evaluated":[106],"our":[110],"study.":[112],"present":[114],"3D-Mix,":[115],"plug-and-play":[117],"module":[118],"integrates":[120],"into":[121],"VLA":[123,179],"architectures":[124],"(GR00T-style":[125],"$\u03c0$-style)":[127],"modifying":[129],"existing":[130],"MLLM":[131,139],"or":[132],"action":[133],"expert":[134],"components.":[135],"Experiments":[136],"across":[137,165],"six":[138],"series":[140],"(nine":[141],"model":[142],"variants,":[143,169],"2B--8B":[144],"parameters)":[145],"SIMPLER":[147,163],"LIBERO":[149],"show":[150],"3D-Mix":[152],"delivers":[153],"consistent":[154],"gains,":[156],"averaging":[157],"+7.0%":[158],"out-of-domain":[161],"(OOD)":[162],"benchmark":[164],"GR00T-style":[168],"establishing":[170],"principled":[172],"approach":[173],"enhancing":[175],"systems.":[180]},"counts_by_year":[],"updated_date":"2026-03-27T06:05:27.210665","created_date":"2026-03-27T00:00:00"}
