{"id":"https://openalex.org/W7160950810","doi":"https://doi.org/10.48550/arxiv.2605.08250","title":"Why Do DiT Editors Drift? Plug-and-Play Low Frequency Alignment in VAE Latent Space","display_name":"Why Do DiT Editors Drift? Plug-and-Play Low Frequency Alignment in VAE Latent Space","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160950810","doi":"https://doi.org/10.48550/arxiv.2605.08250"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08250","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08250","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08250","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072622456","display_name":"Xiaoce Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiaoce","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103268513","display_name":"Sifan Zhou","orcid":"https://orcid.org/0000-0003-3602-7566"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Sifan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135971441","display_name":"Kaifei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kaifei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135916631","display_name":"Leli Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Leli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135938609","display_name":"Xuerui Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Xuerui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135968340","display_name":"Tao He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135921115","display_name":"Ming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7055000066757202,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7055000066757202,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12859","display_name":"Cell Image Analysis Techniques","score":0.03669999912381172,"subfield":{"id":"https://openalex.org/subfields/1304","display_name":"Biophysics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.02630000002682209,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5878000259399414},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4474000036716461},{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.41339999437332153},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.36469998955726624},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3549000024795532},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.33070001006126404},{"id":"https://openalex.org/keywords/latent-semantic-analysis","display_name":"Latent semantic analysis","score":0.32829999923706055}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7856000065803528},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5878000259399414},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48579999804496765},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4474000036716461},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.41339999437332153},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.36469998955726624},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3549000024795532},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C170133592","wikidata":"https://www.wikidata.org/wiki/Q1806883","display_name":"Latent semantic analysis","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.29750001430511475},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.289900004863739},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C175293574","wikidata":"https://www.wikidata.org/wiki/Q697133","display_name":"Word lists by frequency","level":3,"score":0.26489999890327454},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.25290000438690186},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08250","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08250","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08250","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08250","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,51,97],"diffusion":[3,141],"transformers":[4],"(DiTs)":[5],"have":[6],"enabled":[7],"promising":[8],"single-turn":[9],"image":[10],"editing":[11,15,39,71,106,162,197],"capabilities.":[12],"However,":[13],"multi-turn":[14,196],"often":[16],"leads":[17],"to":[18,115,140,146,179],"progressive":[19],"semantic":[20,68,126,189],"drift":[21,64,127],"and":[22,46,111,149,191,202],"quality":[23],"degradation.In":[24],"this":[25,29,82],"work,":[26],"we":[27,56,84],"study":[28],"problem":[30],"from":[31],"a":[32,90],"latent-space":[33],"frequency":[34],"perspective":[35],"by":[36,164],"decomposing":[37],"the":[38,52,59,74,161],"process":[40],"into":[41,160],"two":[42],"functional":[43],"components:":[44],"VAE":[45,53,75,98,167,178],"DiT.":[47],"Through":[48],"systematic":[49],"analysis":[50],"latent":[54,99,103,182],"space,":[55],"uncover":[57],"that":[58,65,94,186],"DiT":[60,151],"introduces":[61],"dominant":[62],"low-frequency":[63,113],"accumulates":[66],"as":[67],"misalignment":[69],"across":[70,105,194],"rounds,":[72,122],"while":[73,128],"contributes":[76],"comparatively":[77],"stable":[78],"reconstruction":[79],"bias.Based":[80],"on":[81],"insight,":[83],"propose":[85],"VAE-LFA":[86,101,156,187],"(Low":[87],"Frequency":[88],"Alignment),":[89],"training-free,":[91],"plug-and-play":[92],"method":[93,132],"performs":[95],"alignment":[96],"space.":[100],"decomposes":[102],"discrepancies":[104],"rounds":[107],"via":[108,175],"low-pass":[109],"filtering,":[110],"aligns":[112],"statistics":[114],"an":[116,176],"exponential":[117],"moving":[118],"average":[119],"of":[120],"previous":[121],"effectively":[123],"suppressing":[124],"accumulated":[125],"preserving":[129],"high-frequency":[130],"details.Our":[131],"requires":[133],"no":[134],"retraining,":[135],"ground-truth":[136],"priors,":[137],"or":[138],"access":[139],"parameters,":[142],"making":[143],"it":[144,173],"applicable":[145],"both":[147,200],"white-box":[148,154],"black-box":[150,171],"editors.":[152],"For":[153],"models,":[155,172],"is":[157],"seamlessly":[158],"integrated":[159],"pipeline":[163],"eliminating":[165],"redundant":[166],"round":[168],"trips;":[169],"for":[170],"operates":[174],"off-the-shelf":[177],"perform":[180],"inter-round":[181],"alignment.Extensive":[183],"experiments":[184],"demonstrate":[185],"improves":[188],"consistency":[190],"visual":[192],"fidelity":[193],"diverse":[195],"scenarios,":[198],"including":[199],"controlled":[201],"in-the-wild":[203],"images.":[204]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-13T00:00:00"}
