{"id":"https://openalex.org/W4387560383","doi":"https://doi.org/10.48550/arxiv.2310.05737","title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","display_name":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","publication_year":2023,"publication_date":"2023-10-09","ids":{"openalex":"https://openalex.org/W4387560383","doi":"https://doi.org/10.48550/arxiv.2310.05737"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2310.05737","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.05737","pdf_url":"https://arxiv.org/pdf/2310.05737","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2310.05737","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056705600","display_name":"Lijun Yu","orcid":"https://orcid.org/0000-0003-0645-1657"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Lijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102805247","display_name":"Jos\u00e9 Lezama","orcid":"https://orcid.org/0009-0002-2936-875X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lezama, Jos\u00e9","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067011413","display_name":"Nitesh B. Gundavarapu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gundavarapu, Nitesh B.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090111536","display_name":"Luca Versari","orcid":"https://orcid.org/0000-0003-3495-1325"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Versari, Luca","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072573916","display_name":"Kihyuk Sohn","orcid":"https://orcid.org/0000-0003-4303-8319"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sohn, Kihyuk","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034317020","display_name":"David Minnen","orcid":"https://orcid.org/0000-0002-4235-5630"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Minnen, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100349974","display_name":"Yong Cheng","orcid":"https://orcid.org/0000-0001-9043-8302"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Yong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Birodkar, Vighnesh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Birodkar, Vighnesh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079573718","display_name":"Agrim Gupta","orcid":"https://orcid.org/0000-0002-5213-0224"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Agrim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046782091","display_name":"Xiuye Gu","orcid":"https://orcid.org/0000-0001-5568-564X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Xiuye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103099928","display_name":"Alexander G. Hauptmann","orcid":"https://orcid.org/0000-0003-2123-0684"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hauptmann, Alexander G.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017319429","display_name":"Boqing Gong","orcid":"https://orcid.org/0000-0003-3915-5977"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Boqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100418319","display_name":"Ming\u2013Hsuan Yang","orcid":"https://orcid.org/0000-0003-4848-2304"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ming-Hsuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070348998","display_name":"Irfan Essa","orcid":"https://orcid.org/0000-0002-6236-2969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Essa, Irfan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087975493","display_name":"David A. Ross","orcid":"https://orcid.org/0000-0001-7426-9561"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ross, David A.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5029634208","display_name":"Lu Jiang","orcid":"https://orcid.org/0000-0003-2694-2015"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Lu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":16,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":21,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9444000124931335,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.9438999891281128,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7543232440948486},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5953940153121948},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.4994204044342041},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.4973912537097931},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4854017198085785},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.43914783000946045},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.43861401081085205},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.42938005924224854},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41990065574645996},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.39372897148132324},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3662410378456116},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.135219007730484},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.10520553588867188},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.10512274503707886}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7543232440948486},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5953940153121948},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.4994204044342041},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.4973912537097931},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4854017198085785},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.43914783000946045},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.43861401081085205},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.42938005924224854},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41990065574645996},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.39372897148132324},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3662410378456116},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.135219007730484},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.10520553588867188},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.10512274503707886},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2310.05737","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.05737","pdf_url":"https://arxiv.org/pdf/2310.05737","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2310.05737","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2310.05737","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2310.05737","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.05737","pdf_url":"https://arxiv.org/pdf/2310.05737","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7799999713897705}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387560383.pdf","grobid_xml":"https://content.openalex.org/works/W4387560383.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W2280377497","https://openalex.org/W4387506531","https://openalex.org/W4238433571","https://openalex.org/W3174044702","https://openalex.org/W2967848559","https://openalex.org/W4283803360"],"abstract_inverted_index":{"While":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"are":[5],"the":[6,39,110,124],"dominant":[7],"models":[8,22,90],"for":[9,32,50,69,138],"generative":[10],"tasks":[11],"in":[12],"language,":[13],"they":[14],"do":[15],"not":[16],"perform":[17],"as":[18,20],"well":[19],"diffusion":[21,89],"on":[23,91,115],"image":[24,93],"and":[25,66,72,94,100,133],"video":[26,60,95,113,120,126],"generation.":[27],"To":[28],"effectively":[29],"use":[30],"LLMs":[31,87],"visual":[33,40],"generation,":[34],"one":[35],"crucial":[36],"component":[37],"is":[38],"tokenizer":[41,61,108,114],"that":[42,86,106],"maps":[43],"pixel-space":[44],"inputs":[45],"to":[46,63,123,130],"discrete":[47],"tokens":[48,68],"appropriate":[49],"LLM":[51],"learning.":[52],"In":[53,102],"this":[54,81],"paper,":[55],"we":[56,84,104],"introduce":[57],"MAGVIT-v2,":[58],"a":[59,75],"designed":[62],"generate":[64],"concise":[65],"expressive":[67],"both":[70],"videos":[71],"images":[73],"using":[74],"common":[76],"token":[77],"vocabulary.":[78],"Equipped":[79],"with":[80],"new":[82],"tokenizer,":[83],"show":[85],"outperform":[88],"standard":[92],"generation":[96],"benchmarks":[97],"including":[98],"ImageNet":[99],"Kinetics.":[101],"addition,":[103],"demonstrate":[105],"our":[107],"surpasses":[109],"previously":[111],"top-performing":[112],"two":[116],"more":[117],"tasks:":[118],"(1)":[119],"compression":[121],"comparable":[122],"next-generation":[125],"codec":[127],"(VCC)":[128],"according":[129],"human":[131],"evaluations,":[132],"(2)":[134],"learning":[135],"effective":[136],"representations":[137],"action":[139],"recognition":[140],"tasks.":[141]},"counts_by_year":[{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":11}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
