{"id":"https://openalex.org/W7140123232","doi":"https://doi.org/10.18653/v1/2026.eacl-long.83","title":"Unraveling LLM Jailbreaks Through Safety Knowledge Neurons","display_name":"Unraveling LLM Jailbreaks Through Safety Knowledge Neurons","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7140123232","doi":"https://doi.org/10.18653/v1/2026.eacl-long.83"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2026.eacl-long.83","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.eacl-long.83","pdf_url":"https://aclanthology.org/2026.eacl-long.83.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2026.eacl-long.83.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122722849","display_name":"Chongwen Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chongwen Zhao","raw_affiliation_strings":["Duke Kunshan University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Duke Kunshan University","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130375035","display_name":"Yutong Ke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yutong Ke","raw_affiliation_strings":["Duke Kunshan University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Duke Kunshan University","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103236934","display_name":"Kaizhu Huang","orcid":"https://orcid.org/0000-0003-4644-3037"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaizhu Huang","raw_affiliation_strings":["Duke Kunshan University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Duke Kunshan University","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.39869956,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1889","last_page":"1906"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.6067000031471252,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.6067000031471252,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.06520000100135803,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.0575999990105629,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/nervous-system","display_name":"Nervous system","score":0.2865999937057495},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.251800000667572},{"id":"https://openalex.org/keywords/disease","display_name":"Disease","score":0.2517000138759613},{"id":"https://openalex.org/keywords/medline","display_name":"MEDLINE","score":0.2257000058889389},{"id":"https://openalex.org/keywords/central-nervous-system","display_name":"Central nervous system","score":0.22120000422000885}],"concepts":[{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.45080000162124634},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.4027000069618225},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3330000042915344},{"id":"https://openalex.org/C545706735","wikidata":"https://www.wikidata.org/wiki/Q9404","display_name":"Nervous system","level":2,"score":0.2865999937057495},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.251800000667572},{"id":"https://openalex.org/C2779134260","wikidata":"https://www.wikidata.org/wiki/Q12136","display_name":"Disease","level":2,"score":0.2517000138759613},{"id":"https://openalex.org/C2779473830","wikidata":"https://www.wikidata.org/wiki/Q1540899","display_name":"MEDLINE","level":2,"score":0.2257000058889389},{"id":"https://openalex.org/C529278444","wikidata":"https://www.wikidata.org/wiki/Q47273","display_name":"Central nervous system","level":2,"score":0.22120000422000885},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.22040000557899475},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.21979999542236328}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2026.eacl-long.83","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.eacl-long.83","pdf_url":"https://aclanthology.org/2026.eacl-long.83.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2026.eacl-long.83","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.eacl-long.83","pdf_url":"https://aclanthology.org/2026.eacl-long.83.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1698772872","display_name":null,"funder_award_id":"92370119","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5866557873","display_name":null,"funder_award_id":"62376113","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7140123232.pdf","grobid_xml":"https://content.openalex.org/works/W7140123232.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4,32],"achieved":[5,33],"substantial":[6],"progress":[7],"in":[8],"alignment,":[9],"ensuring":[10],"safer":[11],"and":[12,23,83,136,149,160],"more":[13,81],"reliable":[14],"outputs.However,":[15],"jailbreak":[16,36,152],"attacks":[17,37],"can":[18,96],"still":[19,49],"bypass":[20],"these":[21],"safeguards":[22],"provoke":[24],"harmful":[25,44],"responses":[26],"from":[27],"well-aligned":[28],"models.While":[29],"some":[30],"studies":[31],"defenses":[34],"against":[35,126,151],"by":[38],"modifying":[39],"output":[40],"distributions":[41],"or":[42],"detecting":[43],"content,":[45],"the":[46,64,75,91,99],"exact":[47],"rationale":[48],"remains":[50],"elusive.In":[51],"this":[52,110,154],"work,":[53],"we":[54,112],"present":[55],"a":[56,80,103,115,144],"novel":[57],"neuron-level":[58],"interpretability":[59],"method":[60,73],"that":[61,89,118],"focuses":[62],"on":[63,109,147],"role":[65],"of":[66,93],"safety-related":[67,94],"knowledge":[68],"neurons.Unlike":[69],"existing":[70],"approaches,":[71],"our":[72],"projects":[74],"model's":[76],"internal":[77],"representation":[78],"into":[79],"consistent":[82],"interpretable":[84],"vocabulary":[85],"space.We":[86],"then":[87],"show":[88],"adjusting":[90],"activation":[92],"neurons":[95,121],"effectively":[97],"control":[98],"models":[100],"behavior":[101],"with":[102],"mean":[104],"ASR":[105],"higher":[106],"than":[107],"97%.Building":[108],"insight,":[111],"propose":[113],"SafeTuning,":[114],"fine-tuning":[116],"strategy":[117],"reinforces":[119],"safety-critical":[120],"to":[122],"improve":[123],"model":[124],"robustness":[125],"jailbreaks.Safe-Tuning":[127],"consistently":[128],"reduces":[129],"attack":[130],"success":[131],"rates":[132],"across":[133],"multiple":[134],"LLMs":[135],"outperforms":[137],"all":[138],"four":[139],"baseline":[140],"defenses.These":[141],"findings":[142],"offer":[143],"new":[145],"perspective":[146],"understanding":[148],"defending":[150],"attacks.Warning:":[153],"paper":[155],"may":[156],"contain":[157],"offensive":[158],"prompts":[159],"outputs.":[161]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-24T00:00:00"}
