{"id":"https://openalex.org/W7154328448","doi":"https://doi.org/10.48550/arxiv.2604.11331","title":"Any 3D Scene is Worth 1K Tokens: 3D-Grounded Representation for Scene Generation at Scale","display_name":"Any 3D Scene is Worth 1K Tokens: 3D-Grounded Representation for Scene Generation at Scale","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154328448","doi":"https://doi.org/10.48550/arxiv.2604.11331"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11331","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11331","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11331","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133569526","display_name":"Dongxu Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Dongxu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133622716","display_name":"Qi Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Qi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133561619","display_name":"Zhiqi Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhiqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009482239","display_name":"Hangning Zhou","orcid":"https://orcid.org/0009-0009-8894-3137"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Hangning","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133588364","display_name":"Cong Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Cong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133562507","display_name":"Hailong Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Hailong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133605245","display_name":"Mu Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Mu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133618143","display_name":"Zhaopeng Cui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Zhaopeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133614493","display_name":"Peidong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Peidong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4462999999523163,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4462999999523163,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.17499999701976776,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.1678999960422516,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6141999959945679},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5228000283241272},{"id":"https://openalex.org/keywords/extrapolation","display_name":"Extrapolation","score":0.45010000467300415},{"id":"https://openalex.org/keywords/view-synthesis","display_name":"View synthesis","score":0.4180999994277954},{"id":"https://openalex.org/keywords/3d-reconstruction","display_name":"3D reconstruction","score":0.41179999709129333},{"id":"https://openalex.org/keywords/3d-modeling","display_name":"3D modeling","score":0.38190001249313354},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.3752000033855438},{"id":"https://openalex.org/keywords/scale-space","display_name":"Scale space","score":0.3752000033855438}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6485000252723694},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6366000175476074},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6141999959945679},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6111999750137329},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5228000283241272},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.45010000467300415},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.4180999994277954},{"id":"https://openalex.org/C109950114","wikidata":"https://www.wikidata.org/wiki/Q4464732","display_name":"3D reconstruction","level":2,"score":0.41179999709129333},{"id":"https://openalex.org/C2777897806","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3D modeling","level":2,"score":0.38190001249313354},{"id":"https://openalex.org/C99102927","wikidata":"https://www.wikidata.org/wiki/Q3058184","display_name":"Scale space","level":4,"score":0.3752000033855438},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.3675000071525574},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.34290000796318054},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3188000023365021},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.30379998683929443},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2962999939918518},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C197654239","wikidata":"https://www.wikidata.org/wiki/Q7430757","display_name":"Scene statistics","level":3,"score":0.287200003862381},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C205372480","wikidata":"https://www.wikidata.org/wiki/Q210521","display_name":"Image resolution","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2583000063896179}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11331","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11331","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11331","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11331","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"3D":[0,24,35,60,74,99,112,119,136,149,155,176,186,195,210],"scene":[1,113,196,211],"generation":[2,114,197],"has":[3],"long":[4],"been":[5],"dominated":[6],"by":[7],"2D":[8,52,64,77,90,130,143],"multi-view":[9,43],"or":[10,45],"video":[11],"diffusion":[12,53,182,230],"models.":[13],"This":[14,152],"is":[15,234],"due":[16],"not":[17],"only":[18],"to":[19,29,63,80,110,122,133,217],"the":[20,30,40,93,97,107],"lack":[21],"of":[22,42,96,161],"scene-level":[23,34],"latent":[25,86,120,150,187],"representation,":[26,212],"but":[27],"also":[28],"fact":[31],"that":[32],"most":[33],"visual":[36],"data":[37],"exists":[38],"in":[39,89,184,236],"form":[41],"images":[44,218],"videos,":[46],"which":[47,67,140,180,233],"are":[48],"naturally":[49],"compatible":[50],"with":[51],"architectures.":[54],"Typically,":[55],"these":[56,124],"2D-based":[57,237],"approaches":[58],"degrade":[59],"spatial":[61,94],"extrapolation":[62],"temporal":[65],"extension,":[66],"introduces":[68],"two":[69],"fundamental":[70],"issues:":[71],"(i)":[72],"representing":[73,154],"scenes":[75,156],"via":[76],"views":[78],"leads":[79],"significant":[81],"representation":[82,131],"redundancy,":[83],"and":[84,165,170,192,219],"(ii)":[85],"space":[87,121],"rooted":[88],"inherently":[91],"limits":[92],"consistency":[95],"generated":[98],"scenes.":[100],"In":[101],"this":[102,185],"paper,":[103],"we":[104,127,174],"propose,":[105],"for":[106],"first":[108],"time,":[109],"perform":[111],"directly":[115,207],"within":[116],"an":[117],"implicit":[118],"address":[123],"limitations.":[125],"First,":[126],"repurpose":[128],"frozen":[129],"encoders":[132],"construct":[134],"our":[135,205],"Representation":[137],"Autoencoder":[138],"(3DRAE),":[139],"grounds":[141],"view-coupled":[142],"semantic":[144],"representations":[145],"into":[146],"a":[147,209],"view-decoupled":[148],"representation.":[151],"enables":[153],"observed":[157],"from":[158],"arbitrary":[159,224],"numbers":[160],"views--at":[162],"any":[163],"resolution":[164],"aspect":[166],"ratio--with":[167],"fixed":[168],"complexity":[169],"rich":[171],"semantics.":[172],"Then":[173],"introduce":[175],"Diffusion":[177],"Transformer":[178],"(3DDiT),":[179],"performs":[181],"modeling":[183],"space,":[188],"achieving":[189],"remarkably":[190],"efficient":[191],"spatially":[193],"consistent":[194],"while":[198],"supporting":[199],"diverse":[200],"conditioning":[201],"configurations.":[202],"Moreover,":[203],"since":[204],"approach":[206],"generates":[208],"it":[213],"can":[214],"be":[215],"decoded":[216],"optional":[220],"point":[221],"maps":[222],"along":[223],"camera":[225],"trajectories":[226],"without":[227],"requiring":[228],"per-trajectory":[229],"sampling":[231],"pass,":[232],"common":[235],"approaches.":[238]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-15T00:00:00"}
