{"id":"https://openalex.org/W4417051774","doi":"https://doi.org/10.1109/iccv51701.2025.01633","title":"Training-Free Text-Guided Image Editing with Visual Autoregressive Model","display_name":"Training-Free Text-Guided Image Editing with Visual Autoregressive Model","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4417051774","doi":"https://doi.org/10.1109/iccv51701.2025.01633"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01633","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01633","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.23897","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100374826","display_name":"Yu\u2010Fei Wang","orcid":"https://orcid.org/0000-0002-4530-2630"},"institutions":[{"id":"https://openalex.org/I4210142583","display_name":"Snap (United States)","ror":"https://ror.org/04dgkhg68","country_code":"US","type":"company","lineage":["https://openalex.org/I4210142583"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yufei Wang","raw_affiliation_strings":["Snap Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Snap Research","institution_ids":["https://openalex.org/I4210142583"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101553481","display_name":"Lanqing Guo","orcid":"https://orcid.org/0000-0002-9452-4723"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lanqing Guo","raw_affiliation_strings":["UT Austin"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UT Austin","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035588290","display_name":"Zhihao Li","orcid":"https://orcid.org/0000-0003-0364-6627"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Zhihao Li","raw_affiliation_strings":["Nanyang Technological University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074027508","display_name":"Jiaxing Huang","orcid":"https://orcid.org/0000-0001-9176-8901"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Jiaxing Huang","raw_affiliation_strings":["Nanyang Technological University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042680345","display_name":"Pichao Wang","orcid":"https://orcid.org/0000-0002-1430-0237"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Pichao Wang","raw_affiliation_strings":["Nanyang Technological University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102557254","display_name":"Bihan Wen","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Bihan Wen","raw_affiliation_strings":["Nanyang Technological University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100333084","display_name":"Jian Wang","orcid":"https://orcid.org/0000-0002-2525-4272"},"institutions":[{"id":"https://openalex.org/I4210142583","display_name":"Snap (United States)","ror":"https://ror.org/04dgkhg68","country_code":"US","type":"company","lineage":["https://openalex.org/I4210142583"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jian Wang","raw_affiliation_strings":["Snap Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Snap Research","institution_ids":["https://openalex.org/I4210142583"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100374826"],"corresponding_institution_ids":["https://openalex.org/I4210142583"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.3663326,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"17577","last_page":"17586"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6940000057220459,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6940000057220459,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.04149999842047691,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.019300000742077827,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.6873999834060669},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6572999954223633},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.5716999769210815},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.4607999920845032},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.41909998655319214},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4165000021457672},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.39800000190734863},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.35569998621940613}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8547999858856201},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.6873999834060669},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6572999954223633},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5716999769210815},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5317999720573425},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.4607999920845032},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43130001425743103},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.41909998655319214},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4165000021457672},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.39800000190734863},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.35569998621940613},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.334199994802475},{"id":"https://openalex.org/C1893757","wikidata":"https://www.wikidata.org/wiki/Q3653001","display_name":"Inversion (geology)","level":3,"score":0.33090001344680786},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3190999925136566},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.27889999747276306},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C2779200073","wikidata":"https://www.wikidata.org/wiki/Q18395575","display_name":"Visual masking","level":4,"score":0.26460000872612},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.25859999656677246}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01633","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01633","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.23897","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.23897","pdf_url":"https://arxiv.org/pdf/2503.23897","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.23897","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.23897","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.23897","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.23897","pdf_url":"https://arxiv.org/pdf/2503.23897","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Text-guided":[0],"image":[1,67,89,193],"editing":[2,28,90,167,184],"is":[3],"an":[4,143],"essential":[5],"task":[6],"that":[7,117,148,203],"enables":[8],"users":[9],"to":[10,35,50,154],"modify":[11],"images":[12],"through":[13],"natural":[14],"language":[15],"descriptions.":[16],"Recent":[17],"advances":[18],"in":[19,44,71,177,194,219],"diffusion":[20],"models":[21],"and":[22,53,66,108,121,135,151,172,181,215,223],"rectified":[23,216],"flows":[24],"have":[25],"significantly":[26],"improved":[27],"quality,":[29],"primarily":[30],"relying":[31],"on":[32,93],"inversion":[33,45,104],"techniques":[34],"extract":[36],"structured":[37],"noise":[38],"from":[39,124],"input":[40],"images.":[41],"However,":[42],"inaccuracies":[43],"can":[46],"propagate":[47],"errors,":[48],"leading":[49],"unintended":[51,158],"modifications":[52,153],"compromising":[54],"fidelity.":[55],"Moreover,":[56],"even":[57,211],"with":[58,185],"perfect":[59],"inversion,":[60],"the":[61,100,125,129,132,136,166],"entanglement":[62],"between":[63,131],"textual":[64],"prompts":[65],"features":[68],"often":[69],"results":[70],"global":[72],"changes":[73],"when":[74],"only":[75],"local":[76],"edits":[77],"are":[78],"intended.":[79],"To":[80],"address":[81],"these":[82],"challenges,":[83],"we":[84,141],"propose":[85],"a":[86,114,178,190],"novel":[87],"text-guided":[88],"framework":[91,175],"based":[92],"VAR":[94],"(Visual":[95],"AutoRegressive":[96],"modeling),":[97],"which":[98],"eliminates":[99],"need":[101],"for":[102],"explicit":[103],"while":[105],"ensuring":[106],"precise":[107],"controlled":[109],"modifications.":[110],"Our":[111,174],"method":[112,205],"introduces":[113],"caching":[115],"mechanism":[116],"stores":[118],"token":[119,161],"indices":[120],"probability":[122],"distributions":[123],"original":[126],"image,":[127],"capturing":[128],"relationship":[130],"source":[133],"prompt":[134],"image.":[137],"Using":[138],"this":[139],"cache,":[140],"design":[142],"adaptive":[144],"fine-grained":[145],"masking":[146],"strategy":[147],"dynamically":[149],"identifies":[150],"constrains":[152],"relevant":[155],"regions,":[156],"preventing":[157],"changes.":[159],"A":[160],"reassembling":[162],"approach":[163],"further":[164],"refines":[165],"process,":[168],"enhancing":[169],"diversity,":[170],"fidelity,":[171],"control.":[173],"operates":[176],"training-free":[179],"manner":[180],"achieves":[182,206],"high-fidelity":[183],"faster":[186],"inference":[187],"speeds,":[188],"processing":[189],"1K":[191],"resolution":[192],"as":[195,197],"fast":[196],"1.2":[198],"seconds.":[199],"Extensive":[200],"experiments":[201],"demonstrate":[202],"our":[204],"performance":[207],"comparable":[208],"to,":[209],"or":[210],"surpassing,":[212],"existing":[213],"diffusion-":[214],"flow-based":[217],"approaches":[218],"both":[220],"quantitative":[221],"metrics":[222],"visual":[224],"quality.":[225],"The":[226],"code":[227],"will":[228],"be":[229],"released.":[230]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
