{"id":"https://openalex.org/W7128627934","doi":"https://doi.org/10.48550/arxiv.2602.09431","title":"Understanding and Enhancing Encoder-based Adversarial Transferability against Large Vision-Language Models","display_name":"Understanding and Enhancing Encoder-based Adversarial Transferability against Large Vision-Language Models","publication_year":2026,"publication_date":"2026-02-10","ids":{"openalex":"https://openalex.org/W7128627934","doi":"https://doi.org/10.48550/arxiv.2602.09431"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.09431","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125657238","display_name":"Xinwei Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Xinwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125625266","display_name":"Li Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125654790","display_name":"Tianwei Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tianwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016761774","display_name":"Youqian Zhang","orcid":"https://orcid.org/0000-0003-0907-7998"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Youqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125653391","display_name":"Qingqing Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Qingqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125680146","display_name":"Yingnan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yingnan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107018562","display_name":"Ruochen Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Ruochen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125651304","display_name":"Haibo Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Haibo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5125657238"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9688000082969666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9688000082969666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.005200000014156103,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0038999998942017555,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8482000231742859},{"id":"https://openalex.org/keywords/transferability","display_name":"Transferability","score":0.8127999901771545},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.6230999827384949},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5333999991416931},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.46700000762939453},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.41690000891685486},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4117000102996826},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3995000123977661}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8482000231742859},{"id":"https://openalex.org/C61272859","wikidata":"https://www.wikidata.org/wiki/Q7834031","display_name":"Transferability","level":3,"score":0.8127999901771545},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7757999897003174},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.6230999827384949},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5333999991416931},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5322999954223633},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.46700000762939453},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4564000070095062},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.41690000891685486},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4117000102996826},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3995000123977661},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.39809998869895935},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.39309999346733093},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3684000074863434},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33959999680519104},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.33869999647140503},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.31459999084472656},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2870999872684479},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2858000099658966},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2784999907016754},{"id":"https://openalex.org/C171078966","wikidata":"https://www.wikidata.org/wiki/Q111029","display_name":"Root (linguistics)","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25200000405311584},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.09431","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.09431","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.09431","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.09431","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.7874546051025391,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"vision-language":[1],"models":[2,122,190],"(LVLMs)":[3],"have":[4],"achieved":[5],"impressive":[6],"success":[7],"across":[8,53,118,141,187],"multimodal":[9,218],"tasks,":[10],"but":[11],"their":[12,51,124],"reliance":[13],"on":[14,33,126],"visual":[15,116],"inputs":[16],"exposes":[17],"them":[18],"to":[19,47,156],"significant":[20],"adversarial":[21,76],"threats.":[22],"Existing":[23],"encoder-based":[24,75],"attacks":[25,96],"perturb":[26],"the":[27,34,39,70,112,158,162,213],"input":[28],"image":[29],"by":[30,161],"optimizing":[31],"solely":[32],"vision":[35],"encoder,":[36],"rather":[37],"than":[38,199],"entire":[40],"LVLM,":[41],"offering":[42],"a":[43,136,153],"computationally":[44],"efficient":[45],"alternative":[46],"end-to-end":[48],"optimization.":[49],"However,":[50],"transferability":[52,77,198],"different":[54,121,188],"LVLM":[55,209],"architectures":[56],"in":[57,78,165,208],"realistic":[58],"black-box":[59],"scenarios":[60],"remains":[61],"poorly":[62],"understood.":[63],"To":[64],"address":[65],"this":[66],"gap,":[67],"we":[68,92,102,147],"present":[69],"first":[71],"systematic":[72],"study":[73],"towards":[74],"LVLMs.":[79],"Our":[80],"contributions":[81],"are":[82],"threefold.":[83],"First,":[84],"through":[85],"large-scale":[86],"benchmarking":[87],"over":[88],"eight":[89],"diverse":[90],"LVLMs,":[91],"reveal":[93],"that":[94,110,194],"existing":[95,200],"exhibit":[97],"severely":[98],"limited":[99],"transferability.":[100,159],"Second,":[101],"perform":[103],"in-depth":[104],"analysis,":[105,167],"disclosing":[106],"two":[107],"root":[108],"causes":[109,164],"hinder":[111],"transferability:":[113],"(1)":[114],"inconsistent":[115],"grounding":[117,178],"models,":[119,134],"where":[120,135],"focus":[123],"attention":[125],"distinct":[127],"regions;":[128],"(2)":[129],"redundant":[130],"semantic":[131],"alignment":[132],"within":[133],"single":[137],"object":[138],"is":[139],"dispersed":[140],"multiple":[142],"overlapping":[143],"token":[144],"representations.":[145],"Third,":[146],"propose":[148],"Semantic-Guided":[149],"Multimodal":[150],"Attack":[151],"(SGMA),":[152],"novel":[154],"framework":[155],"enhance":[157],"Inspired":[160],"discovered":[163],"our":[166],"SGMA":[168,195],"directs":[169],"perturbations":[170],"toward":[171],"semantically":[172],"critical":[173,205],"regions":[174],"and":[175,182,191,211],"disrupts":[176],"cross-modal":[177],"at":[179],"both":[180],"global":[181],"local":[183],"levels.":[184],"Extensive":[185],"experiments":[186],"victim":[189],"tasks":[192],"show":[193],"achieves":[196],"higher":[197],"attacks.":[201],"These":[202],"results":[203],"expose":[204],"security":[206],"risks":[207],"deployment":[210],"underscore":[212],"urgent":[214],"need":[215],"for":[216],"robust":[217],"defenses.":[219]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-12T00:00:00"}
