{"id":"https://openalex.org/W7140323808","doi":"https://doi.org/10.48550/arxiv.2603.23268","title":"SafeSeek: Universal Attribution of Safety Circuits in Language Models","display_name":"SafeSeek: Universal Attribution of Safety Circuits in Language Models","publication_year":2026,"publication_date":"2026-03-24","ids":{"openalex":"https://openalex.org/W7140323808","doi":"https://doi.org/10.48550/arxiv.2603.23268"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23268","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23268","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23268","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130544574","display_name":"Miao Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Miao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122381331","display_name":"Siyuan Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Siyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020975697","display_name":"Moayad Aloqaily","orcid":"https://orcid.org/0000-0003-2443-7234"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aloqaily, Moayad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130592595","display_name":"Zhenhong Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhenhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067986202","display_name":"Safa Otoum","orcid":"https://orcid.org/0000-0002-0814-7328"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Otoum, Safa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130589677","display_name":"Xing fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"fan, Xing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130614524","display_name":"Kun Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025897338","display_name":"Yufei Guo","orcid":"https://orcid.org/0000-0001-7002-1710"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yufei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130567410","display_name":"Qingsong Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Qingsong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9348000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9348000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.01810000091791153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.009200000204145908,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/backdoor","display_name":"Backdoor","score":0.8719000220298767},{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8305000066757202},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5835999846458435},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5094000101089478},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.42170000076293945},{"id":"https://openalex.org/keywords/electronic-circuit","display_name":"Electronic circuit","score":0.3377000093460083}],"concepts":[{"id":"https://openalex.org/C2781045450","wikidata":"https://www.wikidata.org/wiki/Q254569","display_name":"Backdoor","level":2,"score":0.8719000220298767},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8305000066757202},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6191999912261963},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5835999846458435},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5094000101089478},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.42170000076293945},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4156999886035919},{"id":"https://openalex.org/C134146338","wikidata":"https://www.wikidata.org/wiki/Q1815901","display_name":"Electronic circuit","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.33640000224113464},{"id":"https://openalex.org/C2780505938","wikidata":"https://www.wikidata.org/wiki/Q17093282","display_name":"Unavailability","level":2,"score":0.3100999891757965},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.29179999232292175},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C3017944768","wikidata":"https://www.wikidata.org/wiki/Q1450463","display_name":"Poison control","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.2759000062942505},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2655999958515167},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.26109999418258667},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.25949999690055847}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23268","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23268","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23268","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23268","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mechanistic":[0],"interpretability":[1,51],"reveals":[2],"that":[3,53],"safety-critical":[4],"behaviors":[5],"(e.g.,":[6],"alignment,":[7],"jailbreak,":[8],"backdoor)":[9],"in":[10,17,59,103,107],"Large":[11],"Language":[12],"Models":[13],"(LLMs)":[14],"are":[15],"grounded":[16],"specialized":[18],"functional":[19],"components.":[20],"However,":[21],"existing":[22],"safety":[23,50,57,84,98,139,168],"attribution":[24],"methods":[25,64],"struggle":[26],"with":[27,117,145],"generalization":[28],"and":[29,39,148],"reliability":[30],"due":[31],"to":[32,76,91],"their":[33],"reliance":[34],"on":[35,66,83],"heuristic,":[36],"domain-specific":[37],"metrics":[38],"search":[40],"algorithms.":[41],"To":[42],"address":[43],"this,":[44],"we":[45],"propose":[46],"\\ourmethod,":[47],"a":[48,114],"unified":[49],"framework":[52],"identifies":[54],"functionally":[55],"complete":[56],"circuits":[58,79,95],"LLMs":[60],"via":[61],"optimization.":[62],"Unlike":[63],"focusing":[65],"isolated":[67],"heads":[68,147],"or":[69],"neurons,":[70,150],"\\ourmethod":[71,102],"introduces":[72],"differentiable":[73],"binary":[74],"masks":[75],"extract":[77],"multi-granular":[78],"through":[80],"gradient":[81],"descent":[82],"datasets,":[85],"while":[86,132],"integrates":[87],"Safety":[88],"Circuit":[89],"Tuning":[90],"utilize":[92],"these":[93],"sparse":[94],"for":[96],"efficient":[97],"fine-tuning.":[99],"We":[100],"validate":[101],"two":[104],"key":[105],"scenarios":[106],"LLM":[108],"safety:":[109],"\\textbf{(1)":[110],"backdoor":[111,115],"attacks},":[112],"identifying":[113],"circuit":[116,144,162],"0.42\\%":[118],"sparsity,":[119],"whose":[120,151],"ablation":[121],"eradicates":[122],"the":[123],"Attack":[124],"Success":[125],"Rate":[126],"(ASR)":[127],"from":[128,155],"100\\%":[129],"$\\to$":[130,157],"0.4\\%":[131],"retaining":[133],"over":[134],"99\\%":[135],"general":[136],"utility;":[137],"\\textbf{(2)":[138],"alignment},":[140],"localizing":[141],"an":[142],"alignment":[143],"3.03\\%":[146],"0.79\\%":[149],"removal":[152],"spikes":[153],"ASR":[154],"0.8\\%":[156],"96.9\\%,":[158],"whereas":[159],"excluding":[160],"this":[161],"during":[163],"helpfulness":[164],"fine-tuning":[165],"maintains":[166],"96.5\\%":[167],"retention.":[169]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-26T00:00:00"}
