{"id":"https://openalex.org/W7164862275","doi":"https://doi.org/10.1145/3805622.3810588","title":"VAR-3D: View-aware Auto-Regressive Model for Text-to-3D Generation via a 3D Tokenizer","display_name":"VAR-3D: View-aware Auto-Regressive Model for Text-to-3D Generation via a 3D Tokenizer","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164862275","doi":"https://doi.org/10.1145/3805622.3810588"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810588","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810588","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810588","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126217787","display_name":"Zongcheng Han","orcid":null},"institutions":[{"id":"https://openalex.org/I3923682","display_name":"Soochow University","ror":"https://ror.org/05t8y2r12","country_code":"CN","type":"education","lineage":["https://openalex.org/I3923682"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zongcheng Han","raw_affiliation_strings":["School of Computer Science and Technology, Soochow University, Suzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-3477-9778","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Soochow University, Suzhou, China","institution_ids":["https://openalex.org/I3923682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100609882","display_name":"Yu Hong","orcid":"https://orcid.org/0000-0003-0606-3718"},"institutions":[{"id":"https://openalex.org/I3923682","display_name":"Soochow University","ror":"https://ror.org/05t8y2r12","country_code":"CN","type":"education","lineage":["https://openalex.org/I3923682"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Hong","raw_affiliation_strings":["School of Computer Science and Technology, Soochow University, Suzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-0606-3718","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Soochow University, Suzhou, China","institution_ids":["https://openalex.org/I3923682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101919343","display_name":"Haoran Sun","orcid":"https://orcid.org/0000-0002-1006-0176"},"institutions":[{"id":"https://openalex.org/I3923682","display_name":"Soochow University","ror":"https://ror.org/05t8y2r12","country_code":"CN","type":"education","lineage":["https://openalex.org/I3923682"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoran Sun","raw_affiliation_strings":["School of Computer Science and Technology, Soochow University, Suzhou, China"],"raw_orcid":"https://orcid.org/0009-0006-8072-0308","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Soochow University, Suzhou, China","institution_ids":["https://openalex.org/I3923682"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101714131","display_name":"Dongyan Cao","orcid":"https://orcid.org/0000-0001-8808-7929"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongyan Cao","raw_affiliation_strings":["Suzhou Research Institute, Harbin Institute of Technology, Suzhou, China and Research Center for Social Computing and Interactive Robotics, Harbin Institute of Technology, Harbin, China"],"raw_orcid":"https://orcid.org/0009-0008-6857-5333","affiliations":[{"raw_affiliation_string":"Suzhou Research Institute, Harbin Institute of Technology, Suzhou, China and Research Center for Social Computing and Interactive Robotics, Harbin Institute of Technology, Harbin, China","institution_ids":["https://openalex.org/I204983213"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94087049,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1231","last_page":"1240"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4433000087738037,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.4433000087738037,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.30889999866485596,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.06679999828338623,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/view-synthesis","display_name":"View synthesis","score":0.6021999716758728},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.499099999666214},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.44920000433921814},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.43970000743865967},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.39980000257492065},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.3741999864578247},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.3718999922275543},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.36660000681877136}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.64410001039505},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.6021999716758728},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.499099999666214},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47209998965263367},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.44920000433921814},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.43970000743865967},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.39980000257492065},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.3741999864578247},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3718999922275543},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.36660000681877136},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.35830000042915344},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.33660000562667847},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.33250001072883606},{"id":"https://openalex.org/C126780896","wikidata":"https://www.wikidata.org/wiki/Q899871","display_name":"Distortion (music)","level":4,"score":0.32519999146461487},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.3100999891757965},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2930999994277954},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2791000008583069},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27390000224113464},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.27219998836517334}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810588","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810588","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810588","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810588","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G236580309","display_name":null,"funder_award_id":"62376182","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W2133665775","https://openalex.org/W2962785568","https://openalex.org/W3180355996","https://openalex.org/W4303448003","https://openalex.org/W4312933868","https://openalex.org/W4312974539","https://openalex.org/W4313021454","https://openalex.org/W4385275714","https://openalex.org/W4386066388","https://openalex.org/W4386075660","https://openalex.org/W4390872297","https://openalex.org/W4390873542","https://openalex.org/W4390874424","https://openalex.org/W4400822286","https://openalex.org/W4402733585","https://openalex.org/W4402754301","https://openalex.org/W4402951569","https://openalex.org/W4402951629","https://openalex.org/W4403878921","https://openalex.org/W4403998223","https://openalex.org/W4412588123","https://openalex.org/W4413145881","https://openalex.org/W4413156287","https://openalex.org/W4415796338","https://openalex.org/W4415796345","https://openalex.org/W7133190699","https://openalex.org/W7133196460","https://openalex.org/W7133205559","https://openalex.org/W7133211372","https://openalex.org/W7133220370","https://openalex.org/W7160051695","https://openalex.org/W7160193996","https://openalex.org/W7160196396"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,9,21,147],"auto-regressive":[3,74],"transformers":[4],"have":[5],"achieved":[6],"remarkable":[7],"success":[8],"generative":[10,124],"modeling.":[11],"However,":[12],"text-to-3D":[13],"generation":[14,149],"remains":[15],"challenging,":[16],"primarily":[17],"due":[18],"to":[19,95,126,135],"bottlenecks":[20],"learning":[22],"discrete":[23,105,116],"3D":[24,58,84,90,102],"representations.":[25],"Specifically,":[26],"existing":[27,145],"approaches":[28],"often":[29],"suffer":[30],"from":[31],"information":[32],"loss":[33],"during":[34],"encoding,":[35],"causing":[36],"representational":[37],"distortion":[38],"before":[39],"the":[40,53,61,97,123,136],"quantization":[41],"process.":[42],"This":[43],"effect":[44],"is":[45],"further":[46],"amplified":[47],"by":[48],"vector":[49],"quantization,":[50],"ultimately":[51],"degrading":[52],"geometric":[54,99],"coherence":[55],"of":[56,101],"text-conditioned":[57,73],"shapes.":[59],"Moreover,":[60],"conventional":[62],"two-stage":[63],"training":[64,112],"paradigm":[65],"induces":[66],"an":[67],"objective":[68],"mismatch":[69],"between":[70],"reconstruction":[71],"and":[72,131,151],"generation.":[75],"To":[76],"address":[77],"these":[78],"issues,":[79],"we":[80,108],"propose":[81],"View-aware":[82],"Auto-Regressive":[83],"(VAR-3D),":[85],"which":[86],"intergrates":[87],"a":[88,110],"view-aware":[89],"Vector":[91],"Quantized-Variational":[92],"AutoEncoder":[93],"(VQ-VAE)":[94],"convert":[96],"complex":[98],"structure":[100],"models":[103],"into":[104],"tokens.":[106],"Additionally,":[107],"introduce":[109],"rendering-supervised":[111],"strategy":[113],"that":[114,141],"couples":[115],"token":[117],"prediction":[118],"with":[119],"visual":[120,129],"reconstruction,":[121],"encouraging":[122],"process":[125],"better":[127],"preserve":[128],"fidelity":[130],"structural":[132],"consistency":[133],"relative":[134],"input":[137],"text.":[138],"Experiments":[139],"demonstrate":[140],"VAR-3D":[142],"significantly":[143],"outperforms":[144],"methods":[146],"both":[148],"quality":[150],"text-3D":[152],"alignment.":[153]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
