{"id":"https://openalex.org/W7138264028","doi":"https://doi.org/10.48550/arxiv.2603.15478","title":"ViFeEdit: A Video-Free Tuner of Your Video Diffusion Transformer","display_name":"ViFeEdit: A Video-Free Tuner of Your Video Diffusion Transformer","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138264028","doi":"https://doi.org/10.48550/arxiv.2603.15478"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15478","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15478","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15478","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013672772","display_name":"Ruonan Yu","orcid":"https://orcid.org/0009-0008-4809-7119"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Ruonan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129666598","display_name":"Zhenxiong Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Zhenxiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026637544","display_name":"Zigeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zigeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129679298","display_name":"Songhua Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Songhua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129709231","display_name":"Xinchao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinchao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5605000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5605000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.07829999923706055,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.030799999833106995,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4821999967098236},{"id":"https://openalex.org/keywords/video-editing","display_name":"Video editing","score":0.42590001225471497},{"id":"https://openalex.org/keywords/video-processing","display_name":"Video processing","score":0.4194999933242798},{"id":"https://openalex.org/keywords/image-quality","display_name":"Image quality","score":0.3483000099658966},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.34040001034736633},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.32409998774528503},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.3197999894618988},{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.3156000077724457}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7457000017166138},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4821999967098236},{"id":"https://openalex.org/C2780310081","wikidata":"https://www.wikidata.org/wiki/Q1154312","display_name":"Video editing","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.4194999933242798},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.41100001335144043},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39480000734329224},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.3707999885082245},{"id":"https://openalex.org/C55020928","wikidata":"https://www.wikidata.org/wiki/Q3813865","display_name":"Image quality","level":3,"score":0.3483000099658966},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.34040001034736633},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.32409998774528503},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3190999925136566},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.3156000077724457},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C151211776","wikidata":"https://www.wikidata.org/wiki/Q2778015","display_name":"Video capture","level":3,"score":0.29510000348091125},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.289900004863739},{"id":"https://openalex.org/C30814859","wikidata":"https://www.wikidata.org/wiki/Q4119603","display_name":"Video denoising","level":5,"score":0.266400009393692},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2630999982357025},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.2603999972343445},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C2776865275","wikidata":"https://www.wikidata.org/wiki/Q311666","display_name":"Projector","level":2,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15478","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15478","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15478","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15478","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Diffusion":[0],"Transformers":[1],"(DiTs)":[2],"have":[3],"demonstrated":[4],"remarkable":[5],"scalability":[6],"and":[7,11,23,36,49,91,171],"quality":[8],"in":[9,17,33,63,117,140],"image":[10,30,179],"video":[12,34,47,56,75,83,89,119,169],"generation,":[13],"prompting":[14],"growing":[15],"interest":[16],"extending":[18],"them":[19],"to":[20,28,42,154],"controllable":[21,168],"generation":[22,90,170],"editing":[24,37,126,172],"tasks.":[25],"However,":[26],"compared":[27],"the":[29,43,50,99,113],"counterparts,":[31],"progress":[32],"control":[35],"remains":[38],"limited,":[39],"mainly":[40],"due":[41],"scarcity":[44],"of":[45,54,82,101,167],"paired":[46],"data":[48],"high":[51],"computational":[52],"cost":[53],"training":[55,84,176],"diffusion":[57,76,120],"models.":[58],"To":[59],"address":[60],"this":[61,64,137],"issue,":[62],"paper,":[65],"we":[66],"propose":[67],"a":[68,141],"video-free":[69],"tuning":[70],"framework":[71],"termed":[72],"ViFeEdit":[73,86],"for":[74,148],"transformers.":[77],"Without":[78],"requiring":[79],"any":[80],"forms":[81],"data,":[85],"achieves":[87],"versatile":[88],"editing,":[92],"adapted":[93],"solely":[94],"with":[95,131,144,173],"2D":[96,178],"images.":[97],"At":[98],"core":[100],"our":[102,162],"approach":[103],"is":[104],"an":[105],"architectural":[106],"reparameterization":[107],"that":[108,161],"decouples":[109],"spatial":[110],"independence":[111],"from":[112],"full":[114],"3D":[115],"attention":[116],"modern":[118],"transformers,":[121],"which":[122],"enables":[123],"visually":[124],"faithful":[125],"while":[127],"maintaining":[128],"temporal":[129],"consistency":[130],"only":[132,174],"minimal":[133,175],"additional":[134],"parameters.":[135],"Moreover,":[136],"design":[138],"operates":[139],"dual-path":[142],"pipeline":[143],"separate":[145],"timestep":[146],"embeddings":[147],"noise":[149],"scheduling,":[150],"exhibiting":[151],"strong":[152],"adaptability":[153],"diverse":[155],"conditioning":[156],"signals.":[157],"Extensive":[158],"experiments":[159],"demonstrate":[160],"method":[163],"delivers":[164],"promising":[165],"results":[166],"on":[177],"data.":[180],"Codes":[181],"are":[182],"available":[183],"https://github.com/Lexie-YU/ViFeEdit.":[184]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
