{"id":"https://openalex.org/W7127306783","doi":"https://doi.org/10.48550/arxiv.2602.02027","title":"Light Alignment Improves LLM Safety via Model Self-Reflection with a Single Neuron","display_name":"Light Alignment Improves LLM Safety via Model Self-Reflection with a Single Neuron","publication_year":2026,"publication_date":"2026-02-02","ids":{"openalex":"https://openalex.org/W7127306783","doi":"https://doi.org/10.48550/arxiv.2602.02027"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.02027","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124891545","display_name":"Sicheng Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shen, Sicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124896489","display_name":"Mingyang Lv","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lv, Mingyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124908124","display_name":"Han Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124932765","display_name":"Jialin Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jialin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111000689","display_name":"Binghao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Binghao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124915862","display_name":"Zhou Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081220735","display_name":"Guobin Shen","orcid":"https://orcid.org/0000-0002-4069-2107"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Guobin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124883821","display_name":"Dongcheng Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Dongcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101355519","display_name":"Feifei Zhao","orcid":"https://orcid.org/0009-0006-8286-679X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Feifei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124958854","display_name":"Yi Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Yi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5124891545"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.593500018119812,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.593500018119812,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.13189999759197235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.038100000470876694,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6025000214576721},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5830000042915344},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5630000233650208},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4975999891757965},{"id":"https://openalex.org/keywords/usability","display_name":"Usability","score":0.44760000705718994},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.44679999351501465}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7681000232696533},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6025000214576721},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5830000042915344},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5630000233650208},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4975999891757965},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.44760000705718994},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.44679999351501465},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.429500013589859},{"id":"https://openalex.org/C194544171","wikidata":"https://www.wikidata.org/wiki/Q21105679","display_name":"Gating","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3133000135421753},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C186565885","wikidata":"https://www.wikidata.org/wiki/Q1651163","display_name":"Biological neuron model","level":3,"score":0.2964000105857849},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26010000705718994},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2567000091075897},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2556999921798706}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.02027","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.02027","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.02027","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.02027","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"safety":[1,18,53],"of":[2,14,44,88,145],"large":[3,146],"language":[4,147],"models":[5],"(LLMs)":[6],"has":[7],"increasingly":[8],"emerged":[9],"as":[10,97],"a":[11,79,94,98,133],"fundamental":[12],"aspect":[13],"their":[15],"development.":[16],"Existing":[17],"alignment":[19,46,138],"for":[20,139],"LLMs":[21],"is":[22],"predominantly":[23],"achieved":[24],"through":[25],"post-training":[26],"methods,":[27],"which":[28],"are":[29],"computationally":[30],"expensive":[31],"and":[32,67,70,92,116,127,142],"often":[33],"fail":[34],"to":[35],"generalize":[36],"well":[37],"across":[38,129],"different":[39],"models.":[40,148],"A":[41],"small":[42],"number":[43],"lightweight":[45,137],"approaches":[47],"either":[48],"rely":[49],"heavily":[50],"on":[51,58,136],"prior-computed":[52],"injections":[54],"or":[55],"depend":[56],"excessively":[57],"the":[59,104,140],"model's":[60,105],"own":[61],"capabilities,":[62],"resulting":[63],"in":[64,124],"limited":[65],"generalization":[66,128],"degraded":[68],"efficiency":[69],"usability":[71],"during":[72],"generation.":[73],"In":[74],"this":[75],"work,":[76],"we":[77],"propose":[78],"safety-aware":[80],"decoding":[81],"method":[82],"that":[83],"requires":[84],"only":[85],"low-cost":[86],"training":[87,125],"an":[89],"expert":[90],"model":[91,130],"employs":[93],"single":[95],"neuron":[96],"gating":[99],"mechanism.":[100],"By":[101],"effectively":[102],"balancing":[103],"intrinsic":[106],"capabilities":[107],"with":[108],"external":[109],"guidance,":[110],"our":[111],"approach":[112],"simultaneously":[113],"preserves":[114],"utility":[115],"enhances":[117],"output":[118],"safety.":[119],"It":[120],"demonstrates":[121],"clear":[122],"advantages":[123],"overhead":[126],"scales,":[131],"offering":[132],"new":[134],"perspective":[135],"safe":[141],"practical":[143],"deployment":[144],"Code:":[149],"https://github.com/Beijing-AISI/NGSD.":[150]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-04T00:00:00"}
