{"id":"https://openalex.org/W4416988084","doi":"https://doi.org/10.3233/faia251581","title":"Which Neurons Nudge Normative Stance? Causal Tests and Mechanistic Evidence via Contrastive Last-Token Steering","display_name":"Which Neurons Nudge Normative Stance? Causal Tests and Mechanistic Evidence via Contrastive Last-Token Steering","publication_year":2025,"publication_date":"2025-12-02","ids":{"openalex":"https://openalex.org/W4416988084","doi":"https://doi.org/10.3233/faia251581"},"language":"en","primary_location":{"id":"doi:10.3233/faia251581","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia251581","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.3233/faia251581","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089424168","display_name":"Davide Liga","orcid":"https://orcid.org/0000-0003-1124-0299"},"institutions":[{"id":"https://openalex.org/I186903577","display_name":"University of Luxembourg","ror":"https://ror.org/036x5ad56","country_code":"LU","type":"education","lineage":["https://openalex.org/I186903577"]}],"countries":["LU"],"is_corresponding":true,"raw_author_name":"Davide Liga","raw_affiliation_strings":["University of Luxembourg"],"raw_orcid":"https://orcid.org/0000-0003-1124-0299","affiliations":[{"raw_affiliation_string":"University of Luxembourg","institution_ids":["https://openalex.org/I186903577"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065409273","display_name":"Liuwen Yu","orcid":"https://orcid.org/0000-0002-7200-6001"},"institutions":[{"id":"https://openalex.org/I186903577","display_name":"University of Luxembourg","ror":"https://ror.org/036x5ad56","country_code":"LU","type":"education","lineage":["https://openalex.org/I186903577"]}],"countries":["LU"],"is_corresponding":false,"raw_author_name":"Liuwen Yu","raw_affiliation_strings":["University of Luxembourg"],"raw_orcid":"https://orcid.org/0000-0002-7200-6001","affiliations":[{"raw_affiliation_string":"University of Luxembourg","institution_ids":["https://openalex.org/I186903577"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5089424168"],"corresponding_institution_ids":["https://openalex.org/I186903577"],"apc_list":null,"apc_paid":null,"fwci":4.1459,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.95408983,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.4578000009059906,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.4578000009059906,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1031000018119812,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.05959999933838844,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normative","display_name":"Normative","score":0.8167999982833862},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.5209000110626221},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5001999735832214},{"id":"https://openalex.org/keywords/permissive","display_name":"Permissive","score":0.43790000677108765},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4156000018119812},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4043000042438507},{"id":"https://openalex.org/keywords/logit","display_name":"Logit","score":0.37959998846054077}],"concepts":[{"id":"https://openalex.org/C44725695","wikidata":"https://www.wikidata.org/wiki/Q288156","display_name":"Normative","level":2,"score":0.8167999982833862},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5927000045776367},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.5209000110626221},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5001999735832214},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.44519999623298645},{"id":"https://openalex.org/C15224491","wikidata":"https://www.wikidata.org/wiki/Q7169338","display_name":"Permissive","level":2,"score":0.43790000677108765},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4156000018119812},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4043000042438507},{"id":"https://openalex.org/C140331021","wikidata":"https://www.wikidata.org/wiki/Q1868104","display_name":"Logit","level":2,"score":0.37959998846054077},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.34139999747276306},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.34119999408721924},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.30399999022483826},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.29249998927116394},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.28540000319480896},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.26989999413490295},{"id":"https://openalex.org/C137549413","wikidata":"https://www.wikidata.org/wiki/Q7053127","display_name":"Normative model of decision-making","level":3,"score":0.2687000036239624},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2554999887943268},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.25519999861717224}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3233/faia251581","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia251581","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},{"id":"pmh:oai:orbilu.uni.lu:10993/66856","is_oa":true,"landing_page_url":"https://orbilu.uni.lu/handle/10993/66856","pdf_url":null,"source":{"id":"https://openalex.org/S4306401815","display_name":"Open Repository and Bibliography (University of Luxembourg)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I186903577","host_organization_name":"University of Luxembourg","host_organization_lineage":["https://openalex.org/I186903577"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"Frontiers in Artificial Intelligence and Applications, 110 - 120 (2025-12-02)","raw_type":"peer reviewed"}],"best_oa_location":{"id":"doi:10.3233/faia251581","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia251581","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Normative":[0],"stance":[1,84,116],"underlies":[2],"decisions":[3],"in":[4,50,109,126,162],"law,":[5],"legal":[6],"reasoning,":[7],"policy,":[8],"and":[9,108,133,153,201,212,224],"safety-critical":[10],"settings.":[11],"A":[12],"model\u2019s":[13,33,115],"judgment":[14],"of":[15,73],"what":[16],"is":[17,121,218],"permissible":[18],"vs.":[19,118],"impermissible":[20],"often":[21],"determines":[22],"its":[23,68],"downstream":[24],"behavior.":[25],"We":[26,166],"study":[27],"how":[28,106],"to":[29,45,71],"steer":[30],"a":[31,41,63,74,80,100,127,141],"language":[32],"normative":[34,59,83],"stances":[35],"at":[36,96],"inference":[37],"time":[38],"by":[39,66,170,190,198],"adding":[40],"tiny,":[42],"contrastive":[43,195],"perturbation":[44],"the":[46,97,114,124,158,163,193,210],"last-token":[47,55,69,199],"neural":[48],"activation":[49,70],"late":[51],"MLP":[52],"layers":[53],"(contrastive":[54],"steering).":[56],"For":[57],"each":[58],"prompt,":[60],"we":[61,92,112,139,186,225],"construct":[62],"contrast":[64],"direction":[65,111],"comparing":[67],"that":[72,78],"minimally":[75],"edited":[76],"variant":[77],"implies":[79],"more":[81],"permissive":[82,132],"(e.g.,":[85],"\u201cacceptable\u201d":[86],"rather":[87],"than":[88],"\u201cwrong\u201d).":[89],"During":[90],"generation,":[91],"add":[93],"this":[94],"vector":[95],"last":[98],"token;":[99],"single":[101],"strength":[102],"parameter":[103],"\u03b1":[104],"controls":[105,146],"strongly":[107],"which":[110],"push":[113],"(permissive":[117],"restrictive).":[119],"Impact":[120],"measured":[122],"as":[123],"change":[125],"next-token":[128],"logit":[129],"margin":[130],"between":[131],"restrictive":[134],"continuations.":[135],"To":[136],"avoid":[137],"overclaiming,":[138],"calibrate":[140],"threshold":[142],"\u03c4":[143,161],"on":[144,173,204,229],"neutral":[145,174],"(same":[147],"layers,":[148],"tempered":[149],"strengths":[150],"with":[151],"|\u03b1|\u22641)":[152],"count":[154],"success":[155],"only":[156,192],"when":[157],"shift":[159,211],"exceeds":[160],"expected":[164],"direction.":[165],"also":[167],"assess":[168],"specificity":[169],"verifying":[171],"that,":[172],"control":[175],"prompts,":[176],"steered":[177],"outputs":[178],"exactly":[179],"match":[180],"unsteered":[181],"baselines.":[182],"Beyond":[183],"component-level":[184],"tests,":[185],"probe":[187],"neuron-level":[188],"locality":[189],"steering":[191],"top-k":[194],"neurons":[196],"(ranked":[197],"contrast)":[200],"confirming":[202],"reversibility":[203],"our":[205],"test":[206],"set:":[207],"+\u03b1":[208],"produces":[209],"-\u03b1":[213],"reverses":[214],"it.":[215],"The":[216],"method":[217],"training-free,":[219],"uses":[220],"standard":[221],"forward":[222],"hooks,":[223],"report":[226],"pilot":[227],"results":[228],"Llama-3-8B-Instruct.":[230]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-05T23:21:25.405358","created_date":"2025-12-04T00:00:00"}
