{"id":"https://openalex.org/W7139935053","doi":"https://doi.org/10.48550/arxiv.2603.19127","title":"On Optimizing Multimodal Jailbreaks for Spoken Language Models","display_name":"On Optimizing Multimodal Jailbreaks for Spoken Language Models","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7139935053","doi":"https://doi.org/10.48550/arxiv.2603.19127"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.19127","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19127","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.19127","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130221365","display_name":"Aravind Krishnan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Krishnan, Aravind","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130225305","display_name":"Karolina Sta\u0144czak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sta\u0144czak, Karolina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130216201","display_name":"Dietrich Klakow","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Klakow, Dietrich","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5130221365"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9078999757766724,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9078999757766724,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.016699999570846558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.016200000420212746,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.6284000277519226},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.5498999953269958},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.5426999926567078},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.5199999809265137},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.478300005197525},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4350999891757965}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7814000248908997},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.6284000277519226},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.5498999953269958},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.5426999926567078},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.5199999809265137},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.498199999332428},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49079999327659607},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.478300005197525},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4422999918460846},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4350999891757965},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.2623000144958496},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.25839999318122864}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.19127","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19127","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.19127","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19127","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7912678122520447,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Spoken":[1],"Language":[2],"Models":[3],"(SLMs)":[4],"integrate":[5],"speech":[6],"and":[7,19,77,94,118,144],"text":[8,48,76],"modalities,":[9],"they":[10],"inherit":[11],"the":[12,111],"safety":[13,136],"vulnerabilities":[14],"of":[15,114],"their":[16],"LLM":[17],"backbone":[18],"an":[20],"expanded":[21],"attack":[22,117],"surface.":[23],"SLMs":[24,93],"have":[25],"been":[26],"previously":[27],"shown":[28],"to":[29,32,84,107,128],"be":[30],"susceptible":[31],"jailbreaking,":[33],"where":[34],"adversarial":[35],"prompts":[36],"induce":[37],"harmful":[38],"responses.":[39],"Yet":[40],"existing":[41],"attacks":[42],"largely":[43],"remain":[44],"unimodal,":[45],"optimizing":[46],"either":[47],"or":[49],"audio":[50,96],"in":[51],"isolation.":[52],"We":[53,109],"explore":[54],"gradient-based":[55],"multimodal":[56,67],"jailbreaks":[57],"by":[58,105],"introducing":[59],"JAMA":[60,100],"(Joint":[61],"Audio-text":[62],"Multimodal":[63],"Attack),":[64],"a":[65,121],"joint":[66,116],"optimization":[68],"framework":[69],"combining":[70],"Greedy":[71],"Coordinate":[72],"Gradient":[73,79],"(GCG)":[74],"for":[75,82,139],"Projected":[78],"Descent":[80],"(PGD)":[81],"audio,":[83],"simultaneously":[85],"perturb":[86],"both":[87],"modalities.":[88],"Evaluations":[89],"across":[90],"four":[91,95],"state-of-the-art":[92],"types":[97],"demonstrate":[98],"that":[99,120,134],"surpasses":[101],"unimodal":[102,135],"jailbreak":[103],"rate":[104],"1.5x":[106],"10x.":[108],"analyze":[110],"operational":[112],"dynamics":[113],"this":[115],"show":[119],"sequential":[122],"approximation":[123],"method":[124],"makes":[125],"it":[126],"4x":[127],"6x":[129],"faster.":[130],"Our":[131],"findings":[132],"suggest":[133],"is":[137],"insufficient":[138],"robust":[140],"SLMs.":[141],"The":[142],"code":[143],"data":[145],"are":[146],"available":[147],"at":[148],"https://repos.lsv.uni-saarland.de/akrishnan/multimodal-jailbreak-slm":[149]},"counts_by_year":[],"updated_date":"2026-03-21T06:36:02.116451","created_date":"2026-03-21T00:00:00"}
