{"id":"https://openalex.org/W7119487516","doi":"https://doi.org/10.48550/arxiv.2601.03265","title":"Jailbreak-Zero: A Path to Pareto Optimal Red Teaming for Large Language Models","display_name":"Jailbreak-Zero: A Path to Pareto Optimal Red Teaming for Large Language Models","publication_year":2025,"publication_date":"2025-12-18","ids":{"openalex":"https://openalex.org/W7119487516","doi":"https://doi.org/10.48550/arxiv.2601.03265"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.03265","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.03265","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.03265","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122317046","display_name":"Kai Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hu, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104083401","display_name":"Abhinav Aggarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aggarwal, Abhinav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122326420","display_name":"Mehran Khodabandeh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khodabandeh, Mehran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122385733","display_name":"David Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122355857","display_name":"Eric Hsin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsin, Eric","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122304120","display_name":"Li Tat John Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017781286","display_name":"Ankit Kumar Jain","orcid":"https://orcid.org/0000-0002-9482-6991"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jain, Ankit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122379377","display_name":"Matt Fredrikson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fredrikson, Matt","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5122355282","display_name":"Akash Bharadwaj","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bharadwaj, Akash","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5122317046"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5817999839782715,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5817999839782715,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.06430000066757202,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.06390000134706497,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.7161999940872192},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.65420001745224},{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.5306000113487244},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.49939998984336853},{"id":"https://openalex.org/keywords/expansive","display_name":"Expansive","score":0.4666000008583069},{"id":"https://openalex.org/keywords/pareto-optimal","display_name":"Pareto optimal","score":0.4417000114917755},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4278999865055084},{"id":"https://openalex.org/keywords/volume","display_name":"Volume (thermodynamics)","score":0.42160001397132874}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.7161999940872192},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7063000202178955},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.65420001745224},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.5306000113487244},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.49939998984336853},{"id":"https://openalex.org/C2780502288","wikidata":"https://www.wikidata.org/wiki/Q28838156","display_name":"Expansive","level":3,"score":0.4666000008583069},{"id":"https://openalex.org/C2986314615","wikidata":"https://www.wikidata.org/wiki/Q36829","display_name":"Pareto optimal","level":3,"score":0.4417000114917755},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4278999865055084},{"id":"https://openalex.org/C20556612","wikidata":"https://www.wikidata.org/wiki/Q4469374","display_name":"Volume (thermodynamics)","level":2,"score":0.42160001397132874},{"id":"https://openalex.org/C137635306","wikidata":"https://www.wikidata.org/wiki/Q182667","display_name":"Pareto principle","level":2,"score":0.4131999909877777},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.4099999964237213},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.3926999866962433},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.3693000078201294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35420000553131104},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.326200008392334},{"id":"https://openalex.org/C68781425","wikidata":"https://www.wikidata.org/wiki/Q2052203","display_name":"Multi-objective optimization","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C2777742833","wikidata":"https://www.wikidata.org/wiki/Q1964083","display_name":"Reciprocal","level":2,"score":0.29100000858306885},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.29010000824928284},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.25679999589920044},{"id":"https://openalex.org/C2779545769","wikidata":"https://www.wikidata.org/wiki/Q5135364","display_name":"Closeness","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.03265","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.03265","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.03265","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.03265","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.6894872784614563,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"introduces":[2],"Jailbreak-Zero,":[3],"a":[4,21,26,40,54,129],"novel":[5],"red":[6],"teaming":[7],"methodology":[8],"that":[9],"shifts":[10],"the":[11,62,82,139],"paradigm":[12],"of":[13,43,65,84,142],"Large":[14],"Language":[15],"Model":[16],"(LLM)":[17],"safety":[18,140],"evaluation":[19],"from":[20],"constrained":[22],"example-based":[23],"approach":[24],"to":[25,38,74,106],"more":[27,130],"expansive":[28],"and":[29,47,71,96,101,117,132,137],"effective":[30,118],"policy-based":[31],"framework.":[32],"By":[33],"leveraging":[34],"an":[35],"attack":[36,51,68,90],"LLM":[37],"generate":[39],"high":[41],"volume":[42],"diverse":[44],"adversarial":[45,119],"prompts":[46,120],"then":[48],"fine-tuning":[49],"this":[50,85,113],"model":[52],"with":[53,121],"preference":[55],"dataset,":[56],"Jailbreak-Zero":[57,111],"achieves":[58],"Pareto":[59],"optimality":[60],"across":[61],"crucial":[63],"objectives":[64],"policy":[66],"coverage,":[67],"strategy":[69],"diversity,":[70],"prompt":[72],"fidelity":[73],"real":[75],"user":[76],"inputs.":[77],"The":[78],"empirical":[79],"evidence":[80],"demonstrates":[81],"superiority":[83],"method,":[86],"showcasing":[87],"significantly":[88],"higher":[89],"success":[91],"rates":[92],"against":[93],"both":[94],"open-source":[95],"proprietary":[97],"models":[98],"like":[99],"GPT-40":[100],"Claude":[102],"3.5":[103],"when":[104],"compared":[105],"existing":[107],"state-of-the-art":[108],"techniques.":[109],"Crucially,":[110],"accomplishes":[112],"while":[114],"producing":[115],"human-readable":[116],"minimal":[122],"need":[123],"for":[124,135],"human":[125],"intervention,":[126],"thereby":[127],"presenting":[128],"scalable":[131],"comprehensive":[133],"solution":[134],"identifying":[136],"mitigating":[138],"vulnerabilities":[141],"LLMs.":[143]},"counts_by_year":[],"updated_date":"2026-01-09T23:14:04.187858","created_date":"2026-01-09T00:00:00"}
