{"id":"https://openalex.org/W7117576490","doi":"https://doi.org/10.1145/3733799.3762966","title":"Rethinking How to Evaluate Language Model Jailbreak","display_name":"Rethinking How to Evaluate Language Model Jailbreak","publication_year":2025,"publication_date":"2025-10-13","ids":{"openalex":"https://openalex.org/W7117576490","doi":"https://doi.org/10.1145/3733799.3762966"},"language":null,"primary_location":{"id":"doi:10.1145/3733799.3762966","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3733799.3762966","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th ACM Workshop on Artificial Intelligence and Security","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Hongyu Cai","orcid":"https://orcid.org/0000-0001-9280-8493"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hongyu Cai","raw_affiliation_strings":["Purdue University, West Lafayette, IN, USA"],"raw_orcid":"https://orcid.org/0000-0001-9280-8493","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093013828","display_name":"Arjun Arunasalam","orcid":"https://orcid.org/0009-0001-1631-6064"},"institutions":[{"id":"https://openalex.org/I19700959","display_name":"Florida International University","ror":"https://ror.org/02gz6gg07","country_code":"US","type":"education","lineage":["https://openalex.org/I19700959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arjun Arunasalam","raw_affiliation_strings":["Florida International University, Miami, FL, USA"],"raw_orcid":"https://orcid.org/0009-0001-1631-6064","affiliations":[{"raw_affiliation_string":"Florida International University, Miami, FL, USA","institution_ids":["https://openalex.org/I19700959"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121552075","display_name":"Leo Y. Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Leo Y. Lin","raw_affiliation_strings":["Purdue University, West Lafayette, IN, USA"],"raw_orcid":"https://orcid.org/0009-0009-1561-0729","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121542937","display_name":"Antonio Bianchi","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Antonio Bianchi","raw_affiliation_strings":["Purdue University, West Lafayette, IN, USA"],"raw_orcid":"https://orcid.org/0000-0002-2862-5286","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5005376753","display_name":"Z. Berkay Celik","orcid":"https://orcid.org/0000-0001-7362-8905"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Z. Berkay Celik","raw_affiliation_strings":["Purdue University, West Lafayette, IN, USA"],"raw_orcid":"https://orcid.org/0000-0001-7362-8905","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.78289323,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"52","last_page":"63"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.21950000524520874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.21950000524520874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.18770000338554382,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.13809999823570251,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.6284000277519226},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5493000149726868},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.49799999594688416},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4878999888896942},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.48730000853538513},{"id":"https://openalex.org/keywords/string","display_name":"String (physics)","score":0.4765999913215637},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4763999879360199},{"id":"https://openalex.org/keywords/clarity","display_name":"CLARITY","score":0.45649999380111694},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4564000070095062}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7405999898910522},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.6284000277519226},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5493000149726868},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.49799999594688416},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4878999888896942},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.48730000853538513},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.4765999913215637},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4763999879360199},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.460099995136261},{"id":"https://openalex.org/C2777146004","wikidata":"https://www.wikidata.org/wiki/Q14949826","display_name":"CLARITY","level":2,"score":0.45649999380111694},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4564000070095062},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44749999046325684},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.4268999993801117},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39469999074935913},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3774000108242035},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.3614000082015991},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3603000044822693},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3508000075817108},{"id":"https://openalex.org/C2777206241","wikidata":"https://www.wikidata.org/wiki/Q194431","display_name":"Paragraph","level":2,"score":0.3409999907016754},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.301800012588501},{"id":"https://openalex.org/C2780771206","wikidata":"https://www.wikidata.org/wiki/Q3271761","display_name":"Safeguard","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C2777515626","wikidata":"https://www.wikidata.org/wiki/Q496939","display_name":"Levenshtein distance","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C482391","wikidata":"https://www.wikidata.org/wiki/Q101244","display_name":"Acronym","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C98183937","wikidata":"https://www.wikidata.org/wiki/Q2112188","display_name":"Program analysis","level":2,"score":0.25270000100135803},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3733799.3762966","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3733799.3762966","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th ACM Workshop on Artificial Intelligence and Security","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.8069772720336914}],"awards":[{"id":"https://openalex.org/G6062656956","display_name":null,"funder_award_id":"IIS-2229876","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W3170572542","https://openalex.org/W4243989635","https://openalex.org/W4389524506","https://openalex.org/W4405181600","https://openalex.org/W4412073525"],"related_works":[],"abstract_inverted_index":{"Developers":[0],"implement":[1],"safeguards":[2,30],"for":[3,121,242],"large":[4],"language":[5,54,191,251],"models":[6],"(LLMs)":[7],"to":[8,27,184,249],"restrict":[9],"specific":[10],"responses":[11],"and":[12,52,84,94,128,140,177,180,188,232,244,254],"prevent":[13],"the":[14,62,65,89,103,142,149,172,175,198,240,257],"generation":[15],"of":[16,23,40,91,119,144,151,229,259],"unsafe":[17],"content.":[18,37],"However,":[19],"a":[20,107,116,205],"growing":[21],"number":[22],"jailbreak":[24,75,99,104,159,215,221,246],"approaches":[25,42],"aim":[26],"circumvent":[28],"these":[29,41,152],"by":[31],"crafting":[32],"prompts":[33],"that":[34],"elicit":[35],"prohibited":[36],"The":[38],"success":[39],"is":[43],"typically":[44],"evaluated":[45],"using":[46],"techniques":[47],"such":[48],"as":[49],"string":[50],"matching":[51],"natural":[53,190],"understanding,":[55],"which":[56,170,193],"yield":[57],"an":[58,156],"outcome":[59],"indicating":[60],"whether":[61],"response":[63,100,168,173],"violates":[64],"safeguard.":[66],"Our":[67],"analysis":[68],"reveals":[69],"two":[70,164],"key":[71],"limitations":[72],"in":[73],"existing":[74,220],"evaluation":[76,80,160,222,247],"methodologies:":[77],"(1)":[78,166],"their":[79,145],"objectives":[81],"lack":[82],"clarity":[83],"do":[85],"not":[86],"directly":[87],"assess":[88,256],"safety":[90,253],"generated":[92],"responses,":[93],"(2)":[95,189],"they":[96],"conflate":[97],"different":[98],"outcomes,":[101],"oversimplifying":[102],"result":[105],"into":[106],"binary":[108],"success/failure":[109],"outcome.":[110],"In":[111],"this":[112],"paper,":[113],"we":[114,154],"propose":[115],"novel":[117],"set":[118],"metrics":[120,132,248],"evaluating":[122],"LLM":[123,167],"jailbreaks:":[124],"safeguard":[125],"violation,":[126],"informativeness,":[127],"relative":[129],"truthfulness.":[130],"These":[131,237],"can":[133],"differentiate":[134],"between":[135],"attackers":[136],"with":[137],"varying":[138],"goals":[139,258],"capture":[141],"nuances":[143],"motivations.":[146],"To":[147],"automate":[148],"computation":[150],"metrics,":[153],"introduce":[155],"automated":[157],"multi-metric":[158],"framework":[161],"(amJE)":[162],"comprising":[163],"components:":[165],"preprocessing,":[169],"tokenizes":[171],"at":[174],"paragraph":[176],"sentence":[178],"levels":[179],"removes":[181],"invalid":[182],"segments":[183],"enhance":[185],"metric":[186,196],"accuracy,":[187],"generation,":[192],"evaluates":[194],"each":[195],"from":[197,213],"processed":[199],"responses.":[200],"We":[201],"evaluate":[202],"amJE":[203,217],"on":[204],"benchmark":[206],"dataset":[207],"containing":[208],"250":[209],"malicious":[210,260],"intents":[211],"constructed":[212],"recent":[214],"approaches.":[216],"outperforms":[218],"three":[219,235],"methods,":[223],"achieving":[224],"average":[225],"F1":[226],"score":[227],"improvements":[228],"45%,":[230],"322%,":[231],"12%":[233],"over":[234],"baselines.":[236],"findings":[238],"highlight":[239],"need":[241],"sound":[243],"precise":[245],"ensure":[250],"model":[252],"accurately":[255],"actors.":[261]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-30T00:00:00"}
