{"id":"https://openalex.org/W7128790345","doi":"https://doi.org/10.48550/arxiv.2602.11157","title":"Response-Based Knowledge Distillation for Multilingual Jailbreak Prevention Unwittingly Compromises Safety","display_name":"Response-Based Knowledge Distillation for Multilingual Jailbreak Prevention Unwittingly Compromises Safety","publication_year":2025,"publication_date":"2025-12-08","ids":{"openalex":"https://openalex.org/W7128790345","doi":"https://doi.org/10.48550/arxiv.2602.11157"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.11157","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125912365","display_name":"Max Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Max","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125954089","display_name":"Derek Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Derek","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125889948","display_name":"Kai Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125907348","display_name":"Joshua Franco","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Franco, Joshua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125895538","display_name":"Haihao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Haihao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5125912365"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1551000028848648,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1551000028848648,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.09839999675750732,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.09350000321865082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.659500002861023},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5378999710083008},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.48089998960494995},{"id":"https://openalex.org/keywords/counterintuitive","display_name":"Counterintuitive","score":0.47780001163482666},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.4142000079154968},{"id":"https://openalex.org/keywords/plan","display_name":"Plan (archaeology)","score":0.40380001068115234},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.3982999920845032}],"concepts":[{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.659500002861023},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6236000061035156},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5378999710083008},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.48089998960494995},{"id":"https://openalex.org/C101097943","wikidata":"https://www.wikidata.org/wiki/Q5176983","display_name":"Counterintuitive","level":2,"score":0.47780001163482666},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.4142000079154968},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.40380001068115234},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.3982999920845032},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.3928999900817871},{"id":"https://openalex.org/C85973986","wikidata":"https://www.wikidata.org/wiki/Q1091731","display_name":"Exploratory research","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3817000091075897},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3059000074863434},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.28290000557899475},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2745000123977661},{"id":"https://openalex.org/C156325361","wikidata":"https://www.wikidata.org/wiki/Q1152864","display_name":"Grounded theory","level":3,"score":0.26080000400543213},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2603999972343445},{"id":"https://openalex.org/C190248442","wikidata":"https://www.wikidata.org/wiki/Q839486","display_name":"Qualitative research","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.11157","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.11157","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.11157","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.11157","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6865071654319763,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"are":[4],"increasingly":[5],"deployed":[6],"worldwide,":[7],"yet":[8],"their":[9],"safety":[10,140,150,178],"alignment":[11],"remains":[12],"predominantly":[13],"English-centric.":[14],"This":[15],"allows":[16],"for":[17,106,176,183],"vulnerabilities":[18],"in":[19,35,152,157,186],"non-English":[20],"contexts,":[21],"especially":[22],"with":[23,57,126],"low-resource":[24],"languages.":[25],"We":[26,45],"introduce":[27],"a":[28,51,89,118,136,174,181],"novel":[29],"application":[30],"of":[31,38,50,139,171],"knowledge":[32],"distillation":[33],"(KD)":[34],"the":[36,47,85,95,131,167],"context":[37],"multilingual":[39,72,177],"jailbreak":[40,73],"prevention,":[41],"examining":[42],"its":[43],"efficacy.":[44],"distill":[46],"refusal":[48,98],"behaviors":[49],"proprietary":[52],"teacher":[53],"model":[54],"(OpenAI":[55],"o1-mini)":[56],"Low-Rank":[58],"Adaptation":[59],"(LoRA)":[60],"into":[61],"three":[62],"open-source":[63],"student":[64,108,153],"models:":[65],"Meta-Llama-3-8B-Instruct,":[66],"Gemma-2-2B-IT,":[67],"and":[68,169],"Qwen3-8B,":[69],"using":[70],"~28,000":[71],"prompts":[74],"from":[75],"XSafety":[76],"via":[77],"black-box":[78],"response-based,":[79],"parameter-efficient":[80],"fine-tuning":[81,93],"(PEFT).":[82],"Evaluation":[83],"on":[84,94,130],"MultiJail":[86],"benchmark":[87],"reveals":[88],"counterintuitive":[90],"behavior:":[91],"standard":[92],"teacher's":[96],"``safe''":[97],"data":[99],"inadvertently":[100],"increases":[101],"Jailbreak":[102],"Success":[103],"Rate":[104],"(JSR)":[105],"all":[107],"models,":[109,154],"up":[110],"to":[111,121],"16.6":[112],"percentage":[113],"points.":[114],"Our":[115],"experiments":[116],"reveal":[117],"divergent":[119],"generalization":[120],"unseen":[122],"languages":[123],"during":[124],"distillation,":[125],"varying":[127],"outcomes":[128],"depending":[129],"base":[132],"model.":[133],"By":[134],"removing":[135],"primary":[137],"source":[138],"degradation,":[141],"nuanced":[142],"`boundary'":[143],"refusals,":[144],"we":[145],"mitigate":[146],"or":[147],"even":[148],"reverse":[149],"declines":[151],"although":[155],"reductions":[156],"reasoning":[158],"performance":[159],"(GSM8K)":[160],"persist.":[161],"Overall,":[162],"our":[163],"exploratory":[164],"study":[165],"highlights":[166],"challenges":[168],"potential":[170],"KD":[172],"as":[173],"technique":[175],"alignment,":[179],"offering":[180],"foundation":[182],"future":[184],"research":[185],"this":[187],"direction.":[188]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-14T00:00:00"}
