{"id":"https://openalex.org/W7160945588","doi":"https://doi.org/10.48550/arxiv.2605.08513","title":"A Single Neuron Is Sufficient to Bypass Safety Alignment in Large Language Models","display_name":"A Single Neuron Is Sufficient to Bypass Safety Alignment in Large Language Models","publication_year":2026,"publication_date":"2026-05-08","ids":{"openalex":"https://openalex.org/W7160945588","doi":"https://doi.org/10.48550/arxiv.2605.08513"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08513","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08513","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08513","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057802773","display_name":"Hamid Kazemi","orcid":"https://orcid.org/0000-0003-2590-1051"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kazemi, Hamid","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018206853","display_name":"Atoosa Chegini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chegini, Atoosa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135945849","display_name":"Maria Safi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Safi, Maria","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.438400000333786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.438400000333786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.23669999837875366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1265999972820282,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.6309000253677368},{"id":"https://openalex.org/keywords/neuron","display_name":"Neuron","score":0.461899995803833},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3955000042915344},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.2791999876499176}],"concepts":[{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.6309000253677368},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6093999743461609},{"id":"https://openalex.org/C2778794669","wikidata":"https://www.wikidata.org/wiki/Q43054","display_name":"Neuron","level":2,"score":0.461899995803833},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.4341999888420105},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3617999851703644},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31310001015663147},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.30489999055862427},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.2833000123500824},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2621000111103058}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08513","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08513","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08513","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08513","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.6137955188751221,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Safety":[0],"alignment":[1,84,118],"in":[2,34],"language":[3],"models":[4,64],"operates":[5],"through":[6],"two":[7,66],"mechanistically":[8],"distinct":[9],"systems:":[10],"refusal":[11,105,114],"neurons":[12,22,97,115],"that":[13,23,82,98],"gate":[14,104],"whether":[15],"harmful":[16,26,48,54,121],"knowledge":[17,27],"is":[18,85,93],"expressed,":[19],"and":[20,52,68],"concept":[21],"encode":[24],"the":[25,112],"itself.":[28],"By":[29],"targeting":[30],"a":[31],"single":[32],"neuron":[33],"each":[35,100],"system,":[36],"we":[37],"demonstrate":[38],"both":[39],"directions":[40],"of":[41,111],"failure":[42],"--":[43,61,107],"bypassing":[44],"safety":[45,83,117],"on":[46],"explicit":[47],"requests":[49],"via":[50,59],"suppression,":[51],"inducing":[53],"content":[55],"from":[56],"innocent":[57],"prompts":[58],"amplification":[60],"across":[62,89,119],"seven":[63],"spanning":[65],"families":[67],"1.7B":[69],"to":[70,103],"70B":[71],"parameters,":[72],"without":[73],"any":[74,109],"training":[75],"or":[76],"prompt":[77],"engineering.":[78],"Our":[79],"findings":[80],"suggest":[81],"not":[86],"robustly":[87],"distributed":[88],"model":[90],"weights":[91],"but":[92],"mediated":[94],"by":[95],"individual":[96],"are":[99],"causally":[101],"sufficient":[102],"behavior":[106],"suppressing":[108],"one":[110],"identified":[113],"bypasses":[116],"diverse":[120],"requests.":[122]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
