{"id":"https://openalex.org/W4407631956","doi":"https://doi.org/10.48550/arxiv.2502.09638","title":"Jailbreaking to Jailbreak","display_name":"Jailbreaking to Jailbreak","publication_year":2025,"publication_date":"2025-02-09","ids":{"openalex":"https://openalex.org/W4407631956","doi":"https://doi.org/10.48550/arxiv.2502.09638"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2502.09638","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.09638","pdf_url":"https://arxiv.org/pdf/2502.09638","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2502.09638","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5116082873","display_name":"Jeremy Kritz","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kritz, Jeremy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116292983","display_name":"Vaughn Robinson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Robinson, Vaughn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116292984","display_name":"Robert Vacareanu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vacareanu, Robert","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116292985","display_name":"Bijan Varjavand","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Varjavand, Bijan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116292986","display_name":"Michael Choi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Michael","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116292987","display_name":"Bobby Gogov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gogov, Bobby","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114601391","display_name":"Scale Red Team","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Team, Scale Red","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050610019","display_name":"Summer Yue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yue, Summer","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116082874","display_name":"Willow E. Primack","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Primack, Willow E.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101622957","display_name":"Zifan Wang","orcid":"https://orcid.org/0000-0002-8961-4302"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zifan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5116082873"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.8119999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.8119999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12519","display_name":"Cybercrime and Law Enforcement Studies","score":0.7609999775886536,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10751","display_name":"Forensic and Genetic Research","score":0.7584999799728394,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/business","display_name":"Business","score":0.35053473711013794}],"concepts":[{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.35053473711013794}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2502.09638","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.09638","pdf_url":"https://arxiv.org/pdf/2502.09638","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2502.09638","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.09638","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2502.09638","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.09638","pdf_url":"https://arxiv.org/pdf/2502.09638","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4407631956.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2765153054","https://openalex.org/W2596173151","https://openalex.org/W3213789065","https://openalex.org/W3080576469","https://openalex.org/W4231340554","https://openalex.org/W1491839574","https://openalex.org/W2931838652","https://openalex.org/W3168296622","https://openalex.org/W4399331938"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"can":[4,59,112],"be":[5],"used":[6,97,144],"to":[7,14,42,98,139],"red":[8,78,162],"team":[9],"other":[10],"models":[11,24,28,66],"(e.g.":[12,38],"jailbreaking)":[13],"elicit":[15],"harmful":[16],"contents.":[17],"While":[18],"prior":[19],"works":[20],"commonly":[21],"employ":[22],"open-weight":[23],"or":[25,74],"private":[26],"uncensored":[27],"for":[29],"doing":[30,81],"jailbreaking,":[31,44],"as":[32,132],"the":[33,62,124,146,166,183],"refusal-training":[34],"of":[35,64,116,148,182],"strong":[36,86,135],"LLMs":[37,51],"OpenAI":[39],"o3)":[40],"refuse":[41],"help":[43],"our":[45],"work":[46],"turn":[47],"(almost)":[48],"any":[49],"black-box":[50,106],"into":[52],"attackers.":[53],"The":[54],"resulting":[55],"$J_2$":[56,100,110,136,150,171,173],"(jailbreaking-to-jailbreak)":[57],"attackers":[58,101,137],"effectively":[60],"jailbreak":[61,113],"safeguard":[63,147],"target":[65],"using":[67],"various":[68],"strategies,":[69],"both":[70],"created":[71],"by":[72],"themselves":[73],"from":[75],"expert":[76,160],"human":[77,161],"teamers.":[79],"In":[80],"so,":[82],"we":[83],"show":[84],"their":[85],"but":[87],"under-researched":[88],"jailbreaking":[89],"capabilities.":[90],"Our":[91],"experiments":[92],"demonstrate":[93],"that":[94],"1)":[95],"prompts":[96],"create":[99],"transfer":[102],"across":[103],"almost":[104],"all":[105],"models;":[107],"2)":[108],"an":[109],"attacker":[111],"a":[114],"copy":[115],"itself,":[117],"and":[118,164],"this":[119],"vulnerability":[120],"develops":[121],"rapidly":[122],"over":[123],"past":[125],"12":[126],"months;":[127],"3)":[128],"reasong":[129],"models,":[130],"such":[131],"Sonnet-3.7,":[133],"are":[134],"compared":[138],"others.":[140],"For":[141],"example,":[142],"when":[143],"against":[145,179],"GPT-4o,":[149],"(Sonnet-3.7)":[151],"achieves":[152,175],"0.975":[153],"attack":[154],"success":[155],"rate":[156],"(ASR),":[157],"which":[158],"matches":[159],"teamers":[163],"surpasses":[165],"state-of-the-art":[167],"algorithm-based":[168],"attacks.":[169],"Among":[170],"attackers,":[172],"(o3)":[174],"highest":[176],"ASR":[177],"(0.605)":[178],"Sonnet-3.5,":[180],"one":[181],"most":[184],"robust":[185],"models.":[186]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
