{"id":"https://openalex.org/W7127910947","doi":"https://doi.org/10.48550/arxiv.2602.04521","title":"$C$-$\u0394\u0398$: Circuit-Restricted Weight Arithmetic for Selective Refusal","display_name":"$C$-$\u0394\u0398$: Circuit-Restricted Weight Arithmetic for Selective Refusal","publication_year":2026,"publication_date":"2026-02-04","ids":{"openalex":"https://openalex.org/W7127910947","doi":"https://doi.org/10.48550/arxiv.2602.04521"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.04521","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052686099","display_name":"Aditya Kasliwal","orcid":"https://orcid.org/0000-0001-5547-3778"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kasliwal, Aditya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125243122","display_name":"Pratinav Seth","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seth, Pratinav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125158402","display_name":"Vinay Kumar Sankarapu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sankarapu, Vinay Kumar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5052686099"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10142","display_name":"Formal Methods in Verification","score":0.4171999990940094,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10142","display_name":"Formal Methods in Verification","score":0.4171999990940094,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.18279999494552612,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.14300000667572021,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4927000105381012},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4487000107765198},{"id":"https://openalex.org/keywords/ask-price","display_name":"Ask price","score":0.3549000024795532},{"id":"https://openalex.org/keywords/psychological-intervention","display_name":"Psychological intervention","score":0.3075999915599823},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.30709999799728394},{"id":"https://openalex.org/keywords/electronic-circuit","display_name":"Electronic circuit","score":0.2994999885559082}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6202999949455261},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.5224999785423279},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4927000105381012},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4487000107765198},{"id":"https://openalex.org/C90329073","wikidata":"https://www.wikidata.org/wiki/Q914232","display_name":"Ask price","level":2,"score":0.3549000024795532},{"id":"https://openalex.org/C27415008","wikidata":"https://www.wikidata.org/wiki/Q7256382","display_name":"Psychological intervention","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C134146338","wikidata":"https://www.wikidata.org/wiki/Q1815901","display_name":"Electronic circuit","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C194544171","wikidata":"https://www.wikidata.org/wiki/Q21105679","display_name":"Gating","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C182306322","wikidata":"https://www.wikidata.org/wiki/Q1779371","display_name":"Order (exchange)","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29580000042915344},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27889999747276306},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2614000141620636},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.2606000006198883}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.04521","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.04521","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.04521","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.04521","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"deployments":[1],"require":[2],"LLMs":[3],"to":[4,141],"enforce":[5],"safety":[6],"policies":[7],"at":[8],"scale,":[9],"yet":[10],"many":[11],"controls":[12],"rely":[13],"on":[14,118,153],"inference-time":[15,57,134],"interventions":[16],"that":[17,84,119],"add":[18],"recurring":[19],"compute":[20],"cost":[21,37,137],"and":[22,35,108,150,155],"serving":[23],"complexity.":[24],"Activation":[25],"steering":[26,50],"is":[27,51],"widely":[28],"used,":[29],"but":[30,53],"it":[31],"requires":[32],"runtime":[33],"hooks":[34],"scales":[36],"with":[38,132],"the":[39],"number":[40],"of":[41,74,123],"generations;":[42],"conditional":[43],"variants":[44],"improve":[45],"selectivity":[46,149],"by":[47],"gating":[48],"when":[49],"applied":[52],"still":[54],"retain":[55],"an":[56],"control":[58],"path.":[59],"We":[60,90,146],"ask":[61],"whether":[62],"selective":[63],"refusal":[64,76,154],"can":[65,70],"be":[66,77],"moved":[67],"entirely":[68],"offline:":[69],"a":[71,80,87,103,111,128,142],"mechanistic":[72],"understanding":[73],"category-specific":[75],"distilled":[78],"into":[79],"circuit-restricted":[81],"weight":[82,113],"update":[83,114],"deploys":[85],"as":[86,102],"standard":[88],"checkpoint?":[89],"propose":[91],"C-\u0394\u03b8:":[92],"Circuit":[93],"Restricted":[94],"Weight":[95],"Arithmetic,":[96],"which":[97],"(i)":[98],"localizes":[99],"refusal-causal":[100],"computation":[101],"sparse":[104],"circuit":[105,120],"using":[106],"EAP-IG":[107],"(ii)":[109],"computes":[110],"constrained":[112],"\u0394\u03b8C":[115,126],"supported":[116],"only":[117],"(typically":[121],"&lt;5%":[122],"parameters).":[124],"Applying":[125],"yields":[127],"drop-in":[129],"edited":[130],"checkpoint":[131],"no":[133],"hooks,":[135],"shifting":[136],"from":[138],"per-request":[139],"intervention":[140],"one-time":[143],"offline":[144],"update.":[145],"evaluate":[147],"category-targeted":[148],"capability":[151],"retention":[152],"utility":[156],"benchmarks.":[157]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-07T00:00:00"}
