{"id":"https://openalex.org/W7138461545","doi":"https://doi.org/10.1609/aaai.v40i7.37436","title":"Text-Guided Gradient Refinement: Resolving Multimodal Gradient Conflicts to Boost Adversarial Attacks on Vision-Language Models","display_name":"Text-Guided Gradient Refinement: Resolving Multimodal Gradient Conflicts to Boost Adversarial Attacks on Vision-Language Models","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138461545","doi":"https://doi.org/10.1609/aaai.v40i7.37436"},"language":"en","primary_location":{"id":"doi:10.1609/aaai.v40i7.37436","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37436","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37436/41398","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37436/41398","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129680859","display_name":"Yuyang Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuyang Huang","raw_affiliation_strings":["ByteDance Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025152125","display_name":"Tianzuo Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianzuo Luo","raw_affiliation_strings":["ByteDance Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129743419","display_name":"Hengyuan Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hengyuan Guo","raw_affiliation_strings":["ByteDance Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5002690636","display_name":"Yuren Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuren Zhang","raw_affiliation_strings":["ByteDance Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5129680859"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.58107466,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"7","first_page":"5212","last_page":"5220"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7888000011444092,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7888000011444092,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.05920000001788139,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.05770000070333481,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8738999962806702},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6694999933242798},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.612500011920929},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.43140000104904175},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.3684999942779541},{"id":"https://openalex.org/keywords/projection","display_name":"Projection (relational algebra)","score":0.3677000105381012}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8738999962806702},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7211999893188477},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6694999933242798},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.612500011920929},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5407000184059143},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.43140000104904175},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3278000056743622},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.32249999046325684},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3028999865055084},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.2809000015258789},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.27149999141693115},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.2669000029563904}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1609/aaai.v40i7.37436","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37436","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37436/41398","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:ojs.aaai.org:article/37436","is_oa":false,"landing_page_url":"https://ojs.aaai.org/index.php/AAAI/article/view/37436","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"2159-5399","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i7.37436","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37436","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37436/41398","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138461545.pdf","grobid_xml":"https://content.openalex.org/works/W7138461545.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"have":[3],"advanced":[4],"multimodal":[5],"understanding,":[6],"yet":[7],"they":[8],"remain":[9],"susceptible":[10],"to":[11,35,62,150,202],"adversarial":[12,224],"attacks.":[13],"Among":[14],"various":[15],"strategies,":[16],"transfer-based":[17],"attacks":[18],"are":[19],"notably":[20],"effective,":[21],"especially":[22],"in":[23,84],"black-box":[24],"scenarios.":[25],"The":[26],"dominant":[27],"approach":[28,47],"within":[29],"this":[30,46,69,152],"paradigm":[31],"leverages":[32],"generative":[33,53],"models":[34,54,179],"create":[36],"image":[37,88,160,165],"targets":[38,161],"from":[39,49,127],"text,":[40],"consistently":[41],"outperforming":[42],"text-only":[43],"methods.":[44],"However,":[45],"suffers":[48],"a":[50,141,146,215],"fundamental":[51],"limitation:":[52],"introduce":[55],"visual":[56,119],"features":[57],"irrelevant":[58],"or":[59],"even":[60],"detrimental":[61],"textual":[63,129,174],"semantics,":[64],"misguiding":[65],"optimization.":[66],"To":[67],"investigate":[68],"limitation,":[70],"we":[71,135],"conduct":[72],"comprehensive":[73],"analysis":[74],"revealing":[75],"two":[76],"critical":[77],"findings.":[78],"First,":[79],"optimal":[80],"attack":[81,190,209],"directions":[82],"lie":[83],"synergistic":[85,221],"spaces":[86],"between":[87],"and":[89,167,183,222],"text":[90,94],"gradients,":[91],"demonstrating":[92],"that":[93,117,144,171,186],"provides":[95,114],"complementary":[96],"information.":[97],"Second,":[98],"widespread":[99],"gradient":[100,166],"conflicts":[101],"occur":[102],"when":[103],"combining":[104],"modalities,":[105],"where":[106],"image-target":[107],"gradients":[108],"oppose":[109,172],"text-target":[110],"directions.":[111],"This":[112],"conflict":[113],"direct":[115],"evidence":[116],"extraneous":[118],"information":[120],"actively":[121],"harms":[122],"optimization,":[123],"driving":[124],"it":[125],"away":[126],"intended":[128],"objectives.":[130],"Based":[131],"on":[132,178,194],"these":[133],"insights,":[134],"propose":[136],"Text-Guided":[137],"Gradient":[138],"Refinement":[139],"(TGGR),":[140],"novel":[142],"framework":[143,217],"employs":[145],"conflict-aware":[147],"projection":[148],"mechanism":[149],"resolve":[151],"conflict.":[153],"TGGR":[154,187,196],"preserves":[155],"the":[156,164,173],"beneficial":[157],"characteristics":[158],"of":[159,200],"by":[162],"decomposing":[163],"surgically":[168],"removing":[169],"components":[170],"guidance.":[175],"Extensive":[176],"experiments":[177],"such":[180],"as":[181],"LLaVA":[182],"GPT-4o":[184],"demonstrate":[185],"substantially":[188],"improves":[189],"success":[191,210],"rates.":[192],"Specifically,":[193],"GPT-4o,":[195],"yields":[197],"an":[198],"improvement":[199],"up":[201],"14%":[203],"over":[204],"state-of-the-art":[205],"methods,":[206],"achieving":[207],"96%":[208],"rate.":[211],"Our":[212],"work":[213],"offers":[214],"principled":[216],"for":[218],"developing":[219],"more":[220],"effective":[223],"strategies":[225],"against":[226],"VLMs.":[227]},"counts_by_year":[],"updated_date":"2026-06-06T09:05:17.133730","created_date":"2026-03-18T00:00:00"}
