{"id":"https://openalex.org/W7161710229","doi":"https://doi.org/10.1007/s00371-026-04480-4","title":"Adaptive ensemble attack: breaking Large Multimodal Models via dynamic caption selection and weighted gradients","display_name":"Adaptive ensemble attack: breaking Large Multimodal Models via dynamic caption selection and weighted gradients","publication_year":2026,"publication_date":"2026-05-19","ids":{"openalex":"https://openalex.org/W7161710229","doi":"https://doi.org/10.1007/s00371-026-04480-4"},"language":"en","primary_location":{"id":"doi:10.1007/s00371-026-04480-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00371-026-04480-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00371-026-04480-4.pdf","source":{"id":"https://openalex.org/S73060445","display_name":"The Visual Computer","issn_l":"0178-2789","issn":["0178-2789","1432-2315"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Visual Computer","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s00371-026-04480-4.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5019645361","display_name":"Sudhir Kumar Pandey","orcid":"https://orcid.org/0000-0003-3807-9423"},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sudhir Kumar Pandey","raw_affiliation_strings":["Chongqing Key Laboratory of Image Cognition, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","Key Laboratory of Big Data Intelligent Computing, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","School of Computer Science and Technology, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chongqing Key Laboratory of Image Cognition, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]},{"raw_affiliation_string":"Key Laboratory of Big Data Intelligent Computing, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]},{"raw_affiliation_string":"School of Computer Science and Technology, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136497337","display_name":"Jian-Xun Mi","orcid":null},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian-Xun Mi","raw_affiliation_strings":["Chongqing Key Laboratory of Image Cognition, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","Key Laboratory of Big Data Intelligent Computing, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","School of Computer Science and Technology, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chongqing Key Laboratory of Image Cognition, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]},{"raw_affiliation_string":"Key Laboratory of Big Data Intelligent Computing, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]},{"raw_affiliation_string":"School of Computer Science and Technology, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135414295","display_name":"Israr Ahmad","orcid":null},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]},{"id":"https://openalex.org/I42934936","display_name":"Dublin City University","ror":"https://ror.org/04a1a1e81","country_code":"IE","type":"education","lineage":["https://openalex.org/I42934936"]}],"countries":["CN","IE"],"is_corresponding":false,"raw_author_name":"Israr Ahmad","raw_affiliation_strings":["Key Lab of Computer Network and Communication Technology, Dublin City University, Dublin, Ireland","School of Computer Science and Technology, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Key Lab of Computer Network and Communication Technology, Dublin City University, Dublin, Ireland","institution_ids":["https://openalex.org/I42934936"]},{"raw_affiliation_string":"School of Computer Science and Technology, Chongqing University of Posts and Telecommunications, Chongqing, 400065, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051139581","display_name":"Muhammad Salman Pathan","orcid":"https://orcid.org/0000-0002-0210-3121"},"institutions":[{"id":"https://openalex.org/I42934936","display_name":"Dublin City University","ror":"https://ror.org/04a1a1e81","country_code":"IE","type":"education","lineage":["https://openalex.org/I42934936"]}],"countries":["IE"],"is_corresponding":true,"raw_author_name":"Muhammad Salman Pathan","raw_affiliation_strings":["ADAPT SFI Research Centre, School of Computing, Dublin City University, Dublin, Ireland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ADAPT SFI Research Centre, School of Computing, Dublin City University, Dublin, Ireland","institution_ids":["https://openalex.org/I42934936"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5051139581"],"corresponding_institution_ids":["https://openalex.org/I42934936"],"apc_list":{"value":2390,"currency":"EUR","value_usd":2990},"apc_paid":{"value":2390,"currency":"EUR","value_usd":2990},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93907866,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"42","issue":"8","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9118000268936157,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9118000268936157,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.014499999582767487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.014399999752640724,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5688999891281128},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5490000247955322},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4864000082015991},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.42320001125335693},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.39480000734329224},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.37470000982284546},{"id":"https://openalex.org/keywords/ground-truth","display_name":"Ground truth","score":0.35850000381469727},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.3490999937057495}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7396000027656555},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5976999998092651},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5688999891281128},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5490000247955322},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4864000082015991},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43880000710487366},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.42320001125335693},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.39480000734329224},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.35850000381469727},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.3490999937057495},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.3407000005245209},{"id":"https://openalex.org/C2779757391","wikidata":"https://www.wikidata.org/wiki/Q6002292","display_name":"Image translation","level":3,"score":0.3199999928474426},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3061999976634979},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2870999872684479},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.28209999203681946},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2815999984741211},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.2614000141620636},{"id":"https://openalex.org/C150921843","wikidata":"https://www.wikidata.org/wiki/Q1170431","display_name":"Resampling","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.25440001487731934},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2540000081062317},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s00371-026-04480-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00371-026-04480-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00371-026-04480-4.pdf","source":{"id":"https://openalex.org/S73060445","display_name":"The Visual Computer","issn_l":"0178-2789","issn":["0178-2789","1432-2315"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Visual Computer","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s00371-026-04480-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00371-026-04480-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00371-026-04480-4.pdf","source":{"id":"https://openalex.org/S73060445","display_name":"The Visual Computer","issn_l":"0178-2789","issn":["0178-2789","1432-2315"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Visual Computer","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320865","display_name":"Dublin City University","ror":"https://ror.org/04a1a1e81"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7161710229.pdf","grobid_xml":"https://content.openalex.org/works/W7161710229.grobid-xml"},"referenced_works_count":15,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2081580037","https://openalex.org/W2774644650","https://openalex.org/W2962847335","https://openalex.org/W2963857521","https://openalex.org/W4312326455","https://openalex.org/W4386076522","https://openalex.org/W4390190425","https://openalex.org/W4390873312","https://openalex.org/W4393148438","https://openalex.org/W4402727057","https://openalex.org/W4413144966","https://openalex.org/W4413402158","https://openalex.org/W7133220561","https://openalex.org/W7160280826"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"Large":[1],"Multimodal":[2],"Models":[3],"(LMMs)":[4],"have":[5],"achieved":[6],"remarkable":[7],"performance":[8],"across":[9,136,188],"vision-language":[10],"tasks,":[11],"yet":[12],"their":[13],"robustness":[14,118],"against":[15,119],"adversarial":[16,70,155],"attacks":[17],"remains":[18],"critically":[19],"underexplored.":[20],"While":[21],"LMMs":[22,195],"are":[23],"vulnerable":[24],"to":[25,34,41,43,81,134,160,199],"visual":[26],"encoder":[27,35],"attacks,":[28],"they":[29],"exhibit":[30],"surprising":[31],"resilience":[32],"due":[33],"diversity\u2014attacks":[36],"optimized":[37],"for":[38,237],"CLIP":[39],"fail":[40],"transfer":[42,212],"EVA-CLIP,":[44],"especially":[45],"when":[46],"textual":[47],"context":[48],"is":[49],"provided.":[50],"We":[51],"introduce":[52],"the":[53,83,164,189,234],"Adaptive":[54],"Ensemble":[55],"PGD":[56],"(AE-PGD)":[57],"attack,":[58],"which":[59],"simultaneously":[60],"targets":[61],"both":[62],"encoders":[63],"through":[64],"three":[65,138],"key":[66],"innovations:":[67],"(1)":[68],"dynamic":[69],"caption":[71,86],"selection":[72],",":[73,94],"combining":[74],"gradient":[75,103,115],"magnitude":[76],"with":[77,209],"global":[78],"semantic":[79,186],"displacement":[80],"identify":[82],"most":[84],"attack-effective":[85],"per":[87],"model;":[88],"(2)":[89],"an":[90,110],"adaptive":[91],"weight":[92],"controller":[93],"dynamically":[95],"balancing":[96],"each":[97],"encoder\u2019s":[98],"contribution":[99],"using":[100],"real-time":[101],"loss,":[102],"norm,":[104],"and":[105,108,144,173,232],"confidence":[106],"metrics;":[107],"(3)":[109],"Expectation":[111],"over":[112],"Transforms":[113],"(EoT)":[114],"update":[116],"ensuring":[117],"input-transformation":[120],"defenses.":[121],"Evaluated":[122],"on":[123,220],"COCO":[124],"2014":[125],"images,":[126],"AE-PGD":[127,192],"reduces":[128],"accuracy":[129],"from":[130],"a":[131,182,202,206,214],"75.42%":[132],"baseline":[133],"0.0%":[135],"all":[137],"evaluation":[139],"metrics\u2014visual":[140],"encoding,":[141],"image-to-text":[142],"recall,":[143],"LLM":[145],"answer":[146],"recall\u2014achieving":[147],"complete":[148],"model":[149],"collapse.":[150],"Manifold":[151],"analysis":[152,180],"confirms":[153],"that":[154],"perturbations":[156],"push":[157],"image":[158],"embeddings":[159],"antipodal":[161],"regions":[162],"of":[163],"joint":[165],"embedding":[166],"space,":[167],"activating":[168],"semantically":[169],"opposite":[170],"concept":[171],"clusters":[172],"producing":[174],"structured":[175],"hallucinations.":[176],"WordNet":[177],"WUP":[178],"similarity":[179],"reveals":[181],"33.5":[183],"percentage":[184,216],"point":[185,217],"drop":[187],"test":[190],"set.":[191],"causes":[193],"state-of-the-art":[194],"(LLaVA,":[196],"Qwen-VL,":[197],"GPT-4V)":[198],"catastrophically":[200],"misidentify":[201],"bullet":[203],"train":[204],"as":[205],"\u201chelicopter":[207],"crash,\u201d":[208],"strong":[210],"black-box":[211],"yielding":[213],"65":[215],"recall":[218],"collapse":[219],"unseen":[221],"encoders.":[222],"This":[223],"work":[224],"exposes":[225],"critical":[226],"vulnerabilities":[227],"in":[228],"current":[229],"LMM":[230],"architectures":[231],"underscores":[233],"urgent":[235],"need":[236],"ensemble-aware":[238],"defense":[239],"mechanisms.":[240]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-05-20T00:00:00"}
