{"id":"https://openalex.org/W7160095423","doi":"https://doi.org/10.48550/arxiv.2605.00503","title":"End-to-End Autoregressive Image Generation with 1D Semantic Tokenizer","display_name":"End-to-End Autoregressive Image Generation with 1D Semantic Tokenizer","publication_year":2026,"publication_date":"2026-05-01","ids":{"openalex":"https://openalex.org/W7160095423","doi":"https://doi.org/10.48550/arxiv.2605.00503"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.00503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.00503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.00503","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052048154","display_name":"Wenda Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chu, Wenda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048847917","display_name":"Bingliang Zhang","orcid":"https://orcid.org/0000-0002-9318-3680"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Bingliang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135271829","display_name":"Jiaqi Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Jiaqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135115958","display_name":"Yizhuo Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yizhuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042544148","display_name":"Linjie Yang","orcid":"https://orcid.org/0000-0003-2766-1143"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Linjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135174576","display_name":"Yisong Yue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yue, Yisong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135137796","display_name":"Qiushan Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Qiushan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9611999988555908,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9611999988555908,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.005499999970197678,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.003700000001117587,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.8927000164985657},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7098000049591064},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.6319000124931335},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6227999925613403},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.48840001225471497},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.47600001096725464}],"concepts":[{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.8927000164985657},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7675999999046326},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7098000049591064},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.678600013256073},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.6319000124931335},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6227999925613403},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.48840001225471497},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.47600001096725464},{"id":"https://openalex.org/C194657046","wikidata":"https://www.wikidata.org/wiki/Q7394685","display_name":"STAR model","level":4,"score":0.45159998536109924},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.429500013589859},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.40139999985694885},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.30970001220703125},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.29980000853538513},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.2896000146865845},{"id":"https://openalex.org/C42536954","wikidata":"https://www.wikidata.org/wiki/Q7049462","display_name":"Nonlinear autoregressive exogenous model","level":3,"score":0.2759999930858612},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C77052588","wikidata":"https://www.wikidata.org/wiki/Q644307","display_name":"Constant false alarm rate","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.00503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.00503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.00503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.00503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Autoregressive":[0],"image":[1],"modeling":[2],"relies":[3],"on":[4,79],"visual":[5],"tokenizers":[6,43,58],"to":[7,32,55],"compress":[8],"images":[9],"into":[10],"compact":[11],"latent":[12],"representations.":[13],"We":[14,48],"design":[15],"an":[16],"end-to-end":[17],"training":[18],"pipeline":[19],"that":[20,41],"jointly":[21],"optimizes":[22],"reconstruction":[23],"and":[24,44],"generation,":[25],"enabling":[26],"direct":[27],"supervision":[28],"from":[29],"generation":[30],"results":[31],"the":[33],"tokenizer.":[34],"This":[35],"contrasts":[36],"with":[37],"prior":[38],"two-stage":[39],"approaches":[40],"train":[42],"generative":[45,64],"models":[46,54],"separately.":[47],"further":[49],"investigate":[50],"leveraging":[51],"vision":[52],"foundation":[53],"improve":[56],"1D":[57],"for":[59],"autoregressive":[60,63],"modeling.":[61],"Our":[62],"model":[65],"achieves":[66],"strong":[67],"empirical":[68],"results,":[69],"including":[70],"a":[71],"state-of-the-art":[72],"FID":[73],"score":[74],"of":[75],"1.48":[76],"without":[77],"guidance":[78],"ImageNet":[80],"256x256":[81],"generation.":[82]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-05T00:00:00"}
