{"id":"https://openalex.org/W7161694506","doi":"https://doi.org/10.48550/arxiv.2605.18390","title":"Vision Foundation Models as Generalist Tokenizers for Image Generation","display_name":"Vision Foundation Models as Generalist Tokenizers for Image Generation","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161694506","doi":"https://doi.org/10.48550/arxiv.2605.18390"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18390","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18390","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18390","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023904965","display_name":"Anlin Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Anlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136458488","display_name":"Qi Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Qi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136486799","display_name":"Xin Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008623498","display_name":"Chuofan Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Chuofan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100565808","display_name":"Lanxi Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Lanxi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136486625","display_name":"Gang Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Gang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136466907","display_name":"Xiangyu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136464604","display_name":"Xiaojuan Qi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Xiaojuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9287999868392944,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9287999868392944,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.010099999606609344,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0071000000461936,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5249999761581421},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.4814999997615814},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.448199987411499},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.42410001158714294},{"id":"https://openalex.org/keywords/grid","display_name":"Grid","score":0.42239999771118164},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.38960000872612},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.37439998984336853},{"id":"https://openalex.org/keywords/view-synthesis","display_name":"View synthesis","score":0.3684000074863434}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7184000015258789},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5598000288009644},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5249999761581421},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.4814999997615814},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.45509999990463257},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.448199987411499},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.42410001158714294},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.42239999771118164},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.38960000872612},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.37439998984336853},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.3684000074863434},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3630000054836273},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.3540000021457672},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.3499999940395355},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3310000002384186},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.31929999589920044},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.31520000100135803},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C2776825360","wikidata":"https://www.wikidata.org/wiki/Q1411921","display_name":"Vagueness","level":3,"score":0.2761000096797943},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2646999955177307},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C55020928","wikidata":"https://www.wikidata.org/wiki/Q3813865","display_name":"Image quality","level":3,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18390","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18390","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18390","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18390","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"this":[1,27],"work,":[2],"we":[3,29,79,181],"explore":[4],"the":[5,35,64,68,150,184,193,223,240],"largely":[6],"unexplored":[7],"direction":[8],"of":[9,18,87,126,146,187,242],"building":[10],"a":[11,19,31,43,58,82,123,139,206,209,232],"generalist":[12,83],"image":[13,220,227,244],"tokenizer":[14,85],"directly":[15],"on":[16,128],"top":[17],"frozen":[20,32],"vision":[21],"foundation":[22,234],"model":[23,116,141],"(VFM).":[24],"To":[25],"build":[26],"tokenizer,":[28],"utilize":[30],"VFM":[33,200,210],"as":[34,205],"encoder":[36],"and":[37,56,93,121,217,235],"introduce":[38],"two":[39],"key":[40],"innovations:":[41],"(1)":[42],"region-adaptive":[44],"quantization":[45],"framework":[46],"to":[47,71],"eliminate":[48],"spatial":[49,156],"redundancy":[50],"in":[51,76,90,101],"standard":[52],"2D":[53],"grid":[54],"features,":[55],"(2)":[57],"semantic":[59,73],"reconstruction":[60],"objective":[61],"that":[62,192],"aligns":[63],"decoded":[65],"outputs":[66],"with":[67,138,213],"VFM's":[69],"representations":[70,225],"preserve":[72],"fidelity.":[74],"Grounded":[75],"these":[77,177],"designs,":[78],"propose":[80],"VFMTok,":[81],"visual":[84],"capable":[86],"operating":[88],"seamlessly":[89],"both":[91,169],"discrete":[92,110],"continuous":[94],"latent":[95,151,218],"spaces.":[96],"VFMTok":[97,137,158],"achieves":[98,122],"substantial":[99],"improvements":[100],"synthesis":[102,162],"quality":[103],"while":[104],"drastically":[105],"enhancing":[106],"token":[107],"efficiency.":[108],"For":[109],"autoregressive":[111],"(AR)":[112],"generation,":[113,135],"it":[114],"accelerates":[115],"convergence":[117],"by":[118],"\\textbf{3":[119],"times}":[120],"state-of-the-art":[124],"gFID":[125,145],"\\textbf{1.36}":[127],"ImageNet":[129],"class-conditional":[130,161],"synthesis.":[131],"Similarly,":[132],"for":[133,226,239],"continuous-space":[134],"integrating":[136],"denoising":[140],"yields":[142],"an":[143],"exceptional":[144],"\\textbf{1.25}.":[147],"Furthermore,":[148],"because":[149],"space":[152],"inherently":[153],"captures":[154],"rich":[155],"semantics,":[157],"enables":[159],"high-fidelity":[160],"without":[163],"classifier-free":[164],"guidance":[165,238],"(\\textbf{w/o":[166],"CFG})":[167],"across":[168],"generative":[170],"paradigms,":[171],"significantly":[172],"accelerating":[173],"inference":[174],"speed.":[175],"Beyond":[176],"remarkable":[178],"empirical":[179],"results,":[180],"systematically":[182],"investigate":[183],"underlying":[185],"mechanisms":[186],"our":[188],"approach.":[189],"We":[190],"discover":[191],"specific":[194],"self-supervised":[195],"learning":[196,216],"objectives":[197],"utilized":[198],"during":[199],"pre-training":[201],"dictate":[202],"its":[203],"effectiveness":[204],"tokenizer.":[207],"Specifically,":[208],"jointly":[211],"optimized":[212],"global":[214],"contrastive":[215],"masked":[219],"modeling":[221],"provides":[222],"optimal":[224],"tokenization.":[228],"These":[229],"insights":[230],"establish":[231],"strong":[233],"offer":[236],"valuable":[237],"design":[241],"future":[243],"tokenizers.":[245]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-20T00:00:00"}
