{"id":"https://openalex.org/W7154483354","doi":"https://doi.org/10.48550/arxiv.2604.12616","title":"Every Picture Tells a Dangerous Story: Memory-Augmented Multi-Agent Jailbreak Attacks on VLMs","display_name":"Every Picture Tells a Dangerous Story: Memory-Augmented Multi-Agent Jailbreak Attacks on VLMs","publication_year":2026,"publication_date":"2026-04-14","ids":{"openalex":"https://openalex.org/W7154483354","doi":"https://doi.org/10.48550/arxiv.2604.12616"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.12616","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12616","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.12616","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133670874","display_name":"Jianhao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Jianhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133702911","display_name":"Haoyang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Haoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133663371","display_name":"Hanjie Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Hanjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053076410","display_name":"Haozhe Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Haozhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133707210","display_name":"Tieyun Qian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Tieyun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5133670874"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.008200000040233135,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.0052999998442828655,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.698199987411499},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.6503000259399414},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6061000227928162},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5572999715805054},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3855000138282776},{"id":"https://openalex.org/keywords/projection","display_name":"Projection (relational algebra)","score":0.36250001192092896},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.3353999853134155},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.33239999413490295}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7516999840736389},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.698199987411499},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.6503000259399414},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6061000227928162},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5572999715805054},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5006999969482422},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3871000111103058},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3855000138282776},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.36250001192092896},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.3353999853134155},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.33239999413490295},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3314000070095062},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3208000063896179},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3199999928474426},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.3133000135421753},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.2872999906539917},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26919999718666077},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2574999928474426},{"id":"https://openalex.org/C156325763","wikidata":"https://www.wikidata.org/wiki/Q1930895","display_name":"Operational semantics","level":3,"score":0.2549000084400177}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.12616","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12616","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.12616","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12616","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.4089631140232086,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"rapid":[1],"evolution":[2],"of":[3,67],"Vision-Language":[4],"Models":[5],"(VLMs)":[6],"has":[7,19],"catalyzed":[8],"unprecedented":[9],"capabilities":[10],"in":[11],"artificial":[12],"intelligence;":[13],"however,":[14,46],"this":[15],"continuous":[16],"modal":[17],"expansion":[18],"inadvertently":[20],"exposed":[21],"a":[22,86,144,184,206,218],"vastly":[23],"broadened":[24],"and":[25,40,122,139],"unconstrained":[26],"adversarial":[27,116],"attack":[28,65,156,164,215],"surface.":[29],"Current":[30],"multimodal":[31,213],"jailbreak":[32,100,155,214],"strategies":[33,142],"primarily":[34],"focus":[35],"on":[36,168],"surface-level":[37],"pixel":[38],"perturbations":[39],"typographic":[41],"attacks":[42],"or":[43],"harmful":[44],"images;":[45],"they":[47],"fail":[48],"to":[49,57,77,97,107,112,131,190,196],"engage":[50],"with":[51],"the":[52,62,75,163],"complex":[53],"semantic":[54,64,81],"structures":[55],"intrinsic":[56],"visual":[58,95,110],"data.":[59],"This":[60],"leaves":[61],"vast":[63],"surface":[66],"original,":[68],"natural":[69],"images":[70,179],"largely":[71],"unscrutinized.":[72],"Driven":[73],"by":[74],"need":[76],"expose":[78],"these":[79],"deep-seated":[80],"vulnerabilities,":[82],"we":[83,202],"introduce":[84],"\\textbf{MemJack},":[85],"\\textbf{MEM}ory-augmented":[87],"multi-agent":[88,105],"\\textbf{JA}ilbreak":[89],"atta\\textbf{CK}":[90],"framework":[91],"that":[92,181],"explicitly":[93],"leverages":[94],"semantics":[96],"orchestrate":[98],"automated":[99],"attacks.":[101],"MemJack":[102,149,182],"employs":[103],"coordinated":[104],"cooperation":[106],"dynamically":[108],"map":[109],"entities":[111],"malicious":[113],"intents,":[114],"generate":[115],"prompts":[117],"via":[118],"multi-angle":[119],"visual-semantic":[120],"camouflage,":[121],"utilize":[123],"an":[124],"Iterative":[125],"Nullspace":[126],"Projection":[127],"(INLP)":[128],"geometric":[129],"filter":[130],"bypass":[132],"premature":[133],"latent":[134],"space":[135],"refusals.":[136],"By":[137],"accumulating":[138],"transferring":[140],"successful":[141],"through":[143],"persistent":[145],"Multimodal":[146],"Experience":[147],"Memory,":[148],"maintains":[150],"highly":[151],"coherent":[152],"extended":[153,193],"multi-turn":[154],"interactions":[157],"across":[158,174],"different":[159],"images,":[160],"thereby":[161],"improving":[162],"success":[165],"rate":[166],"(ASR)":[167],"new":[169],"images.":[170],"Extensive":[171],"empirical":[172],"evaluations":[173],"full,":[175],"unmodified":[176],"COCO":[177],"val2017":[178],"demonstrate":[180],"achieves":[183],"71.48\\%":[185],"ASR":[186],"against":[187],"Qwen3-VL-Plus,":[188],"scaling":[189],"90\\%":[191],"under":[192],"budgets.":[194],"Furthermore,":[195],"catalyze":[197],"future":[198],"defensive":[199],"alignment":[200],"research,":[201],"will":[203],"release":[204],"\\textbf{MemJack-Bench},":[205],"comprehensive":[207],"dataset":[208],"comprising":[209],"over":[210],"113,000":[211],"interactive":[212],"trajectories,":[216],"establishing":[217],"vital":[219],"foundation":[220],"for":[221],"developing":[222],"inherently":[223],"robust":[224],"VLMs.":[225]},"counts_by_year":[],"updated_date":"2026-04-16T06:09:31.884825","created_date":"2026-04-16T00:00:00"}
