{"id":"https://openalex.org/W7162566681","doi":"https://doi.org/10.1109/3dv69130.2026.00177","title":"3D-GENERALIST: Vision-Language-Action Models for Crafting 3D Worlds","display_name":"3D-GENERALIST: Vision-Language-Action Models for Crafting 3D Worlds","publication_year":2026,"publication_date":"2026-03-20","ids":{"openalex":"https://openalex.org/W7162566681","doi":"https://doi.org/10.1109/3dv69130.2026.00177"},"language":null,"primary_location":{"id":"doi:10.1109/3dv69130.2026.00177","is_oa":false,"landing_page_url":"https://doi.org/10.1109/3dv69130.2026.00177","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 International Conference on 3D Vision (3DV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034488596","display_name":"Fan-Yun Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Fan-Yun Sun","raw_affiliation_strings":["Stanford University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137087622","display_name":"Shengguang Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shengguang Wu","raw_affiliation_strings":["Stanford University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137124905","display_name":"Christian Jacobsen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Christian Jacobsen","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072730292","display_name":"Thomas Yim","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Thomas Yim","raw_affiliation_strings":["Stanford University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012714850","display_name":"Haoming Zou","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haoming Zou","raw_affiliation_strings":["Stanford University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137175460","display_name":"Alex Zook","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alex Zook","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137103386","display_name":"Shangru Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shangru Li","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137182176","display_name":"Yu-Hsin Chou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu-Hsin Chou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137179889","display_name":"Ethem Can","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ethem Can","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137093735","display_name":"Xunlei Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xunlei Wu","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076300410","display_name":"Clemens Eppner","orcid":"https://orcid.org/0000-0002-5398-4037"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clemens Eppner","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062182459","display_name":"Valts Blukis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Valts Blukis","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137152770","display_name":"Jonathan Tremblay","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jonathan Tremblay","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137104869","display_name":"Jiajun Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiajun Wu","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133604350","display_name":"Stan Birchfield","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stan Birchfield","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069105490","display_name":"Nick Haber","orcid":"https://orcid.org/0000-0001-8804-7804"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nick Haber","raw_affiliation_strings":["Stanford University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":16,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.83328888,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1873","last_page":"1883"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.11590000241994858,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.11590000241994858,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.058400001376867294,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.04540000110864639,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.27950000762939453},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.2791999876499176},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.26750001311302185},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.26589998602867126},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.26019999384880066}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4553000032901764},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.3109999895095825},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.29100000858306885},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.25529998540878296},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/3dv69130.2026.00177","is_oa":false,"landing_page_url":"https://doi.org/10.1109/3dv69130.2026.00177","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 International Conference on 3D Vision (3DV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W2009188425","https://openalex.org/W2047157868","https://openalex.org/W2068676460","https://openalex.org/W2108598243","https://openalex.org/W2162559028","https://openalex.org/W2250673545","https://openalex.org/W2902539442","https://openalex.org/W2962735233","https://openalex.org/W2963433827","https://openalex.org/W3097660860","https://openalex.org/W4386075660","https://openalex.org/W4386607324","https://openalex.org/W4390874211","https://openalex.org/W4399563697","https://openalex.org/W4399564040","https://openalex.org/W4402354170","https://openalex.org/W4402660086","https://openalex.org/W4402660140","https://openalex.org/W4402703065","https://openalex.org/W4402704625","https://openalex.org/W4402716288","https://openalex.org/W4402727120","https://openalex.org/W4402753875","https://openalex.org/W4402781583","https://openalex.org/W4403081614","https://openalex.org/W4412673735","https://openalex.org/W4413147469","https://openalex.org/W4415796350","https://openalex.org/W4415800665","https://openalex.org/W7133190492","https://openalex.org/W7133197571","https://openalex.org/W7133246057"],"related_works":[],"abstract_inverted_index":{"Creating":[0],"3D":[1,52,73,89,107],"graphics":[2],"content":[3],"for":[4,19,116],"immersive":[5],"and":[6,39,78,100,114,147],"interactive":[7],"worlds":[8],"remains":[9],"labor-intensive,":[10],"limiting":[11],"our":[12,101],"ability":[13],"to":[14,26,69,85],"create":[15],"large-scale":[16],"synthetic":[17,117,145],"data":[18,118,146],"training":[20,102,152],"foundation":[21,124],"models.":[22],"Recent":[23],"methods":[24],"aim":[25],"alleviate":[27],"this,":[28],"but":[29],"they":[30],"often":[31],"focus":[32],"on":[33,126,132,142],"a":[34,56,72,122],"single":[35],"aspect":[36],"(e.g.,":[37],"layout)":[38],"do":[40],"not":[41],"improve":[42],"generation":[43,54,119],"quality":[44,113],"by":[45,120],"simply":[46],"scaling":[47],"computational":[48],"resources.":[49],"We":[50,94,109],"recast":[51],"environment":[53],"as":[55,64],"sequential":[57],"decision-making":[58],"problem,":[59],"using":[60],"Vision-Language":[61],"Models":[62],"(VLMs)":[63],"policies":[65],"that":[66,137],"output":[67],"actions":[68],"jointly":[70],"craft":[71],"environment's":[74],"layout,":[75],"materials,":[76],"lighting,":[77],"assets.":[79],"Our":[80],"framework,":[81],"3D-Generalist,":[82],"trains":[83],"VLMs":[84],"generate":[86],"more":[87],"prompt-aligned":[88],"environments":[90],"via":[91],"self-improvement":[92],"fine-tuning.":[93],"demonstrate":[95,111],"the":[96,127],"effectiveness":[97],"of":[98,155],"3D-Generalist":[99],"strategy":[103],"in":[104],"generating":[105],"simulation-ready":[106],"environments.":[108],"also":[110],"its":[112],"scalability":[115],"pretraining":[121],"vision":[123],"model":[125],"generated":[128],"data.":[129,159],"After":[130],"fine-tuning":[131],"downstream":[133],"tasks,":[134],"we":[135],"show":[136],"it":[138],"surpasses":[139],"models":[140],"pre-trained":[141],"meticulously":[143],"human-crafted":[144],"approaches":[148],"results":[149],"achieved":[150],"when":[151],"with":[153],"orders":[154],"magnitude":[156],"larger":[157],"real":[158]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
