{"id":"https://openalex.org/W7125970969","doi":"https://doi.org/10.48550/arxiv.2601.19375","title":"Selective Steering: Norm-Preserving Control Through Discriminative Layer Selection","display_name":"Selective Steering: Norm-Preserving Control Through Discriminative Layer Selection","publication_year":2026,"publication_date":"2026-01-27","ids":{"openalex":"https://openalex.org/W7125970969","doi":"https://doi.org/10.48550/arxiv.2601.19375"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.19375","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124057528","display_name":"Quy-Anh Dang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dang, Quy-Anh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5012291455","display_name":"Chris Ngo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ngo, Chris","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8744999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8744999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0421999990940094,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.010900000110268593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7364000082015991},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.6161999702453613},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.47350001335144043},{"id":"https://openalex.org/keywords/norm","display_name":"Norm (philosophy)","score":0.46480000019073486},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.40700000524520874},{"id":"https://openalex.org/keywords/control-theory","display_name":"Control theory (sociology)","score":0.3898000121116638},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.3752000033855438},{"id":"https://openalex.org/keywords/layer","display_name":"Layer (electronics)","score":0.3479999899864197}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7364000082015991},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6739000082015991},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.6161999702453613},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.47350001335144043},{"id":"https://openalex.org/C191795146","wikidata":"https://www.wikidata.org/wiki/Q3878446","display_name":"Norm (philosophy)","level":2,"score":0.46480000019073486},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4124000072479248},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.40700000524520874},{"id":"https://openalex.org/C47446073","wikidata":"https://www.wikidata.org/wiki/Q5165890","display_name":"Control theory (sociology)","level":3,"score":0.3898000121116638},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.3752000033855438},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.3479999899864197},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.3465000092983246},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C74050887","wikidata":"https://www.wikidata.org/wiki/Q848368","display_name":"Rotation (mathematics)","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3010999858379364},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.29789999127388},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2849000096321106},{"id":"https://openalex.org/C31531917","wikidata":"https://www.wikidata.org/wiki/Q915157","display_name":"Robust control","level":3,"score":0.27730000019073486},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2669999897480011},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2517000138759613},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.19375","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.19375","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.19375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.19375","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.76144939661026,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"significant":[1],"progress":[2],"in":[3,64,82],"alignment,":[4],"large":[5],"language":[6],"models":[7,83,130],"(LLMs)":[8],"remain":[9],"vulnerable":[10],"to":[11,43],"adversarial":[12],"attacks":[13],"that":[14,106,116,132],"elicit":[15],"harmful":[16],"behaviors.":[17],"Activation":[18],"steering":[19,118],"techniques":[20],"offer":[21],"a":[22,65,100,160],"promising":[23],"inference-time":[24],"intervention":[25],"approach,":[26],"but":[27,68],"existing":[28],"methods":[29,143],"suffer":[30],"from":[31],"critical":[32],"limitations:":[33],"activation":[34,108],"addition":[35],"requires":[36],"careful":[37],"coefficient":[38],"tuning":[39],"and":[40,78,111,149,166],"is":[41],"sensitive":[42],"layer-specific":[44],"norm":[45,73],"variations,":[46],"while":[47,144],"directional":[48],"ablation":[49],"provides":[50,159],"only":[51,119],"binary":[52],"control.":[53],"Recent":[54],"work":[55],"on":[56,154],"Angular":[57],"Steering":[58,134],"introduces":[59],"continuous":[60],"control":[61],"via":[62],"rotation":[63,104],"2D":[66],"subspace,":[67],"its":[69],"practical":[70],"implementation":[71],"violates":[72],"preservation,":[74],"causing":[75],"distribution":[76,109],"shift":[77],"generation":[79],"collapse,":[80],"particularly":[81],"below":[84],"7B":[85],"parameters.":[86],"We":[87],"propose":[88],"Selective":[89,133],"Steering,":[90],"which":[91],"addresses":[92],"these":[93],"limitations":[94],"through":[95],"two":[96],"key":[97],"innovations:":[98],"(1)":[99],"mathematically":[101],"rigorous":[102],"norm-preserving":[103],"formulation":[105],"maintains":[107],"integrity,":[110],"(2)":[112],"discriminative":[113],"layer":[114],"selection":[115],"applies":[117],"where":[120],"feature":[121],"representations":[122],"exhibit":[123],"opposite-signed":[124],"class":[125],"alignment.":[126],"Experiments":[127],"across":[128],"nine":[129],"demonstrate":[131],"achieves":[135],"5.5x":[136],"higher":[137],"attack":[138],"success":[139],"rates":[140],"than":[141],"prior":[142],"maintaining":[145],"zero":[146],"perplexity":[147],"violations":[148],"approximately":[150],"100\\%":[151],"capability":[152],"retention":[153],"standard":[155],"benchmarks.":[156],"Our":[157],"approach":[158],"principled,":[161],"efficient":[162],"framework":[163],"for":[164],"controllable":[165],"stable":[167],"LLM":[168],"behavior":[169],"modification.":[170],"Code:":[171],"https://github.com/knoveleng/steering":[172]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-29T00:00:00"}
