{"id":"https://openalex.org/W7140202574","doi":"https://doi.org/10.48550/arxiv.2603.22283","title":"End-to-End Training for Unified Tokenization and Latent Denoising","display_name":"End-to-End Training for Unified Tokenization and Latent Denoising","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140202574","doi":"https://doi.org/10.48550/arxiv.2603.22283"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22283","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22283","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22283","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Duggal, Shivam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duggal, Shivam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bai, Xingjian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Xingjian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wu, Zongze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zongze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Richard","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Richard","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Shechtman, Eli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shechtman, Eli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Torralba, Antonio","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Torralba, Antonio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Isola, Phillip","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Isola, Phillip","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Freeman, William T.","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Freeman, William T.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8931000232696533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8931000232696533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.04529999941587448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.006000000052154064,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.8263999819755554},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6033999919891357},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5339999794960022},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.47040000557899475},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.46209999918937683},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4586000144481659},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4424999952316284},{"id":"https://openalex.org/keywords/probabilistic-latent-semantic-analysis","display_name":"Probabilistic latent semantic analysis","score":0.42170000076293945},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.38499999046325684}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8313000202178955},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.8263999819755554},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6258999705314636},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6033999919891357},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5339999794960022},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.47040000557899475},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.46209999918937683},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4586000144481659},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4424999952316284},{"id":"https://openalex.org/C112933361","wikidata":"https://www.wikidata.org/wiki/Q2845258","display_name":"Probabilistic latent semantic analysis","level":2,"score":0.42170000076293945},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3873000144958496},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.38499999046325684},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.3237000107765198},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.323199987411499},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.29440000653266907},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.2903999984264374},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2888000011444092},{"id":"https://openalex.org/C500882744","wikidata":"https://www.wikidata.org/wiki/Q269236","display_name":"Latent Dirichlet allocation","level":3,"score":0.2865000069141388},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.27810001373291016},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2662000060081482},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C170133592","wikidata":"https://www.wikidata.org/wiki/Q1806883","display_name":"Latent semantic analysis","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.2540999948978424}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22283","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22283","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22283","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22283","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Latent":[0],"diffusion":[1,28],"models":[2,178],"(LDMs)":[3],"enable":[4,135],"high-fidelity":[5],"synthesis":[6],"by":[7,110],"operating":[8],"in":[9,33],"learned":[10],"latent":[11,36,49,64,83,141,146],"spaces.":[12],"However,":[13],"training":[14,116,205],"state-of-the-art":[15],"LDMs":[16],"requires":[17],"complex":[18],"staging:":[19],"a":[20,54,114,144],"tokenizer":[21,62],"must":[22],"be":[23,31,78],"trained":[24,32],"first,":[25],"before":[26],"the":[27,34,81,128,140,158,187,191],"model":[29],"can":[30,77],"frozen":[35],"space.":[37],"We":[38,184],"propose":[39],"UNITE":[40,51,153],"-":[41],"an":[42],"autoencoder":[43],"architecture":[44],"for":[45,174],"unified":[46],"tokenization":[47,74,90,207],"and":[48,63,75,150,172,176,196],"diffusion.":[50],"consists":[52],"of":[53,157,193,206],"Generative":[55,130,188],"Encoder":[56,189],"that":[57,73,118,201],"serves":[58],"as":[59,80],"both":[60,121],"image":[61,149],"generator":[65],"via":[66,123],"weight":[67],"sharing.":[68],"Our":[69],"key":[70],"insight":[71],"is":[72,212],"generation":[76,98,209],"viewed":[79],"same":[82,129],"inference":[84],"problem":[85],"under":[86],"different":[87],"conditioning":[88],"regimes:":[89],"infers":[91,99],"latents":[92],"from":[93,101,210],"fully":[94],"observed":[95],"images,":[96],"whereas":[97],"them":[100],"noise":[102],"together":[103],"with":[104],"text":[105],"or":[106,164],"class":[107],"conditioning.":[108],"Motivated":[109],"this,":[111],"we":[112],"introduce":[113],"single-stage":[115],"procedure":[117],"jointly":[119,138],"optimizes":[120],"tasks":[122],"two":[124],"forward":[125],"passes":[126],"through":[127,190],"Encoder.":[131],"The":[132],"shared":[133],"parameters":[134],"gradients":[136],"to":[137],"shape":[139],"space,":[142],"encouraging":[143],"\"common":[145],"language\".":[147],"Across":[148],"molecule":[151],"modalities,":[152],"achieves":[154],"near":[155],"state":[156],"art":[159],"performance":[160],"without":[161],"adversarial":[162],"losses":[163],"pretrained":[165],"encoders":[166],"(e.g.,":[167],"DINO),":[168],"reaching":[169],"FID":[170],"2.12":[171],"1.73":[173],"Base":[175],"Large":[177],"on":[179],"ImageNet":[180],"256":[181],"x":[182],"256.":[183],"further":[185],"analyze":[186],"lenses":[192],"representation":[194],"alignment":[195],"compression.":[197],"These":[198],"results":[199],"show":[200],"single":[202],"stage":[203],"joint":[204],"&amp;":[208],"scratch":[211],"feasible.":[213]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-25T00:00:00"}
