{"id":"https://openalex.org/W7161713748","doi":"https://doi.org/10.48550/arxiv.2605.18324","title":"Improved Baselines with Representation Autoencoders","display_name":"Improved Baselines with Representation Autoencoders","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161713748","doi":"https://doi.org/10.48550/arxiv.2605.18324"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18324","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136502993","display_name":"Jaskirat Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Jaskirat","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136471617","display_name":"Boyang Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Boyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136473238","display_name":"Zongze Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zongze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136487620","display_name":"Richard Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Richard","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024777042","display_name":"Eli Shechtman","orcid":"https://orcid.org/0000-0002-6783-1795"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shechtman, Eli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136499329","display_name":"Saining Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Saining","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5491999983787537,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5491999983787537,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.27950000762939453,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.05290000140666962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.8691999912261963},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7792999744415283},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.5597000122070312},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5462999939918518},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.42570000886917114},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4212999939918518},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4083999991416931}],"concepts":[{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.8691999912261963},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7792999744415283},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.72079998254776},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.605400025844574},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.5597000122070312},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5462999939918518},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4424000084400177},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.42570000886917114},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4212999939918518},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4083999991416931},{"id":"https://openalex.org/C77660490","wikidata":"https://www.wikidata.org/wiki/Q244916","display_name":"Intermediate language","level":3,"score":0.3749000132083893},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.36809998750686646},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.29679998755455017},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.28139999508857727},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26910001039505005},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2603999972343445},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.26010000705718994}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Representation":[0],"Autoencoders":[1],"(RAE)":[2],"replace":[3],"traditional":[4],"VAE":[5],"with":[6,131],"pretrained":[7,77],"vision":[8],"encoders.":[9],"In":[10],"this":[11],"paper,":[12],"we":[13,29,69,98],"systematically":[14],"investigate":[15],"several":[16],"design":[17],"choices":[18],"and":[19,25,104,120,135,262],"find":[20],"three":[21],"insights":[22],"which":[23,85],"simplify":[24],"improve":[26],"RAE.":[27,250],"First,":[28],"study":[30,70],"a":[31,100,138,189,204,233],"generalized":[32],"formulation":[33],"where":[34],"the":[35,42,50,71,87,111,127,163,166,185,213,248],"representation":[36,78,82,89,113],"is":[37,271],"defined":[38],"as":[39,79,117,154,232],"sum":[40],"of":[41,165,192,235,242],"last":[43],"k":[44],"encoder":[45,60,119],"layers":[46,92],"rather":[47],"than":[48,180],"solely":[49],"final":[51],"layer.":[52],"This":[53,222],"simple":[54],"change":[55],"greatly":[56],"improves":[57],"reconstruction":[58],"without":[59,219],"finetuning":[61],"or":[62],"specialized":[63],"data":[64],"(e.g.,":[65],"text,":[66],"faces).":[67],"Second,":[68],"prevalent":[72],"assumption":[73],"that":[74,148],"RAE":[75,103,129,157],"(using":[76],"encoder)":[80],"replaces":[81],"alignment":[83],"(REPA),":[84],"distills":[86],"same":[88,112],"to":[90,114,178,212,226],"intermediate":[91,123],"instead.":[93],"Through":[94],"large-scale":[95],"empirical":[96],"analysis,":[97],"uncover":[99],"surprising":[101],"finding:":[102],"REPA":[105,149],"exhibit":[106],"complementary":[107],"working":[108],"mechanisms,":[109],"allowing":[110],"be":[115,152],"used":[116],"both":[118],"target":[121],"for":[122,143,173,247,259],"diffusion":[124,141],"layers.":[125],"Finally,":[126],"original":[128,186,249],"struggles":[130],"classifier-free":[132],"guidance":[133,172],"(CFG)":[134],"requires":[136],"training":[137,236],"second,":[139],"weaker":[140],"model":[142],"AutoGuidance":[144],"(AG).":[145],"We":[146,251],"show":[147],"itself":[150],"can":[151,170],"viewed":[153],"x-prediction":[155],"in":[156,194],"latent":[158],"space.":[159],"By":[160],"simply":[161],"re-parameterizing":[162],"output":[164],"DiT":[167],"model,":[168],"it":[169],"provide":[171],"\"free\".":[174],"Overall,":[175],"RAEv2":[176,202,238],"leads":[177],"more":[179],"10x":[181],"faster":[182],"convergence":[183],"over":[184],"RAE,":[187],"achieving":[188],"state-of-the-art":[190,205],"gFID":[191,229],"1.06":[193],"just":[195,208],"80":[196,209],"epochs":[197,210],"on":[198],"ImageNet-256.":[199],"On":[200],"FDr6,":[201],"achieves":[203],"2.17":[206],"at":[207,273],"compared":[211],"previous":[214],"best":[215],"3.26":[216],"(800":[217],"epochs)":[218],"any":[220],"post-training.":[221],"motivates":[223],"EPFID@k":[224],"(epochs":[225],"reach":[227],"unguided":[228],"&lt;":[230],"k)":[231],"measure":[234],"efficiency.":[237],"attains":[239],"an":[240],"EPFID@2":[241],"35":[243],"epochs,":[244],"versus":[245],"177":[246],"also":[252],"validate":[253],"our":[254],"approach":[255],"across":[256],"diverse":[257],"settings":[258],"text-to-image":[260],"generation":[261],"navigation":[263],"world":[264],"models,":[265],"showing":[266],"consistent":[267],"improvements.":[268],"The":[269],"code":[270],"available":[272],"https://raev2.github.io.":[274]},"counts_by_year":[],"updated_date":"2026-06-17T06:14:20.161405","created_date":"2026-05-20T00:00:00"}
