{"id":"https://openalex.org/W4416036560","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.480","title":"Token-Aware Editing of Internal Activations for Large Language Model Alignment","display_name":"Token-Aware Editing of Internal Activations for Large Language Model Alignment","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036560","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.480"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.480","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.480","pdf_url":"https://aclanthology.org/2025.emnlp-main.480.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.480.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101521052","display_name":"Tianbo Wang","orcid":"https://orcid.org/0000-0003-0765-1693"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tianbo Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035168341","display_name":"Yuqing Ma","orcid":"https://orcid.org/0000-0001-8009-4358"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuqing Ma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Kewei Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kewei Liao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071021087","display_name":"Chayu Yang","orcid":"https://orcid.org/0000-0002-6916-5626"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chengzhao Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120309097","display_name":"Zhange Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhange Zhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031116553","display_name":"Jiakai Wang","orcid":"https://orcid.org/0000-0001-5884-3412"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiakai Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101562121","display_name":"Xianglong Liu","orcid":"https://orcid.org/0009-0000-9205-8135"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xianglong Liu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101521052"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17325642,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9482","last_page":"9520"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33219999074935913,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33219999074935913,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.24220000207424164,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.06800000369548798,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3467000126838684},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.2955000102519989},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.2856000065803528},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.2685999870300293}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6920999884605408},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47769999504089355},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44040000438690186},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3467000126838684},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2793000042438507},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25450000166893005},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.24650000035762787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.480","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.480","pdf_url":"https://aclanthology.org/2025.emnlp-main.480.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.480","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.480","pdf_url":"https://aclanthology.org/2025.emnlp-main.480.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036560.pdf","grobid_xml":"https://content.openalex.org/works/W4416036560.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Intervening":[0],"the":[1,37,69,91,111,122,139,149,164],"internal":[2],"activations":[3],"of":[4,32,125,141,152,166],"large":[5],"language":[6],"models":[7],"(LLMs)":[8],"provides":[9],"an":[10,86],"effective":[11],"inference-time":[12],"alignment":[13,46,66,100,131,136],"approach":[14,61],"to":[15,62,89,120],"mitigate":[16],"undesirable":[17],"behaviors,":[18],"such":[19],"as":[20],"generating":[21],"erroneous":[22],"or":[23],"harmful":[24],"content,":[25],"thereby":[26,128],"ensuring":[27],"safe":[28],"and":[29,48,102,118],"reliable":[30],"applications":[31],"LLMs.However,":[33],"previous":[34],"methods":[35],"neglect":[36],"misalignment":[38,113],"discrepancy":[39],"among":[40],"varied":[41],"tokens,":[42],"resulting":[43],"in":[44,68],"deviant":[45],"direction":[47],"inflexible":[49],"editing":[50,59,126],"strength.To":[51],"address":[52],"these":[53],"issues,":[54],"we":[55],"propose":[56],"a":[57,77],"token-aware":[58],"(TAE)":[60],"fully":[63],"utilize":[64],"token-level":[65,112],"information":[67],"activation":[70,96],"space,":[71],"therefore":[72],"realizing":[73],"superior":[74],"post-intervention":[75],"performance.Specifically,":[76],"Mutual":[78],"Information-guided":[79],"Graph":[80],"Aggregation":[81],"(MIG)":[82],"module":[83],"first":[84],"develops":[85],"MI-guided":[87],"graph":[88],"exploit":[90],"tokens'":[92],"informative":[93],"interaction":[94],"for":[95],"enrichment,":[97],"thus":[98],"improving":[99],"probing":[101],"facilitating":[103],"intervention.Subsequently,":[104],"Misalignment-aware":[105],"Adaptive":[106],"Intervention":[107],"(MAI)":[108],"comprehensively":[109],"perceives":[110],"degree":[114],"from":[115],"token":[116],"representation":[117],"prediction":[119],"guide":[121],"adaptive":[123],"adjustment":[124],"strength,":[127],"enhancing":[129],"final":[130],"performance.Extensive":[132],"experiments":[133],"on":[134,148],"three":[135],"capabilities":[137],"demonstrate":[138],"efficacy":[140],"TAE,":[142],"notably":[143],"surpassing":[144],"baseline":[145],"by":[146],"25.8%":[147],"primary":[150],"metric":[151],"truthfulness":[153],"with":[154],"minimal":[155],"cost.":[156],"1":[157],"MHSA":[158],"FFN":[159],"+":[160],"+Q:":[161],"What":[162],"is":[163],"capital":[165],"UK?":[167]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-11-08T00:00:00"}
