{"id":"https://openalex.org/W4416034915","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.450","title":"SafeInt: Shielding Large Language Models from Jailbreak Attacks via Safety-Aware Representation Intervention","display_name":"SafeInt: Shielding Large Language Models from Jailbreak Attacks via Safety-Aware Representation Intervention","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416034915","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.450"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.450","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.450","pdf_url":"https://aclanthology.org/2025.findings-emnlp.450.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-emnlp.450.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079190032","display_name":"Jiaqi Wu","orcid":"https://orcid.org/0000-0002-3299-7410"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiaqi Wu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100418568","display_name":"Chen Chen","orcid":"https://orcid.org/0000-0003-3957-7061"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101032887","display_name":"Chunyan Hou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chunyan Hou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5062064974","display_name":"Xiaojie Yuan","orcid":"https://orcid.org/0000-0002-5876-6856"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaojie Yuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":5.2763,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.95691677,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"8473","last_page":"8488"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8925999999046326,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8925999999046326,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.008500000461935997,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.00839999970048666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5952000021934509},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.423799991607666},{"id":"https://openalex.org/keywords/intervention","display_name":"Intervention (counseling)","score":0.3440000116825104},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.32690000534057617},{"id":"https://openalex.org/keywords/modeling-language","display_name":"Modeling language","score":0.3262999951839447}],"concepts":[{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5952000021934509},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5622000098228455},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.423799991607666},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4163999855518341},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4081000089645386},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.3262999951839447},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2980000078678131},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2549000084400177},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-emnlp.450","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.450","pdf_url":"https://aclanthology.org/2025.findings-emnlp.450.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.450","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.450","pdf_url":"https://aclanthology.org/2025.findings-emnlp.450.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1776413336","display_name":null,"funder_award_id":"62172237","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4034941058","display_name":null,"funder_award_id":"62077031","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4667360632","display_name":null,"funder_award_id":"62372252","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4787383871","display_name":null,"funder_award_id":"62176028","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5811738602","display_name":null,"funder_award_id":"U1936206","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7560528120","display_name":null,"funder_award_id":"U1936105","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321106","display_name":"Ministry of Education of the People's Republic of China","ror":"https://ror.org/01mv9t934"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416034915.pdf","grobid_xml":"https://content.openalex.org/works/W4416034915.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"With":[0],"the":[1,62,65,92,97,108,116],"widespread":[2],"real-world":[3],"deployment":[4],"of":[5,64,91,94,100,119,127],"large":[6],"language":[7],"models":[8],"(LLMs),":[9],"ensuring":[10],"their":[11],"behavior":[12],"complies":[13],"with":[14,125],"safety":[15],"standards":[16],"has":[17],"become":[18],"crucial.Jailbreak":[19],"attacks":[20,83,156,166],"exploit":[21],"vulnerabilities":[22],"in":[23,115,151,171],"LLMs":[24,80,153],"to":[25,33,39,103,122],"induce":[26],"undesirable":[27],"behavior,":[28],"posing":[29],"a":[30,47,74],"significant":[31],"threat":[32],"LLM":[34],"safety.Previous":[35],"defenses":[36],"often":[37],"fail":[38],"achieve":[40],"both":[41],"effectiveness":[42,170],"and":[43,140,167,183],"efficiency":[44],"simultaneously.Defenses":[45],"from":[46,81],"representation":[48,86,117],"perspective":[49],"offer":[50],"new":[51],"insights,":[52],"but":[53],"existing":[54],"interventions":[55],"cannot":[56],"dynamically":[57],"adjust":[58],"representations":[59,93,106],"based":[60],"on":[61,88],"harmfulness":[63],"queries.To":[66],"address":[67],"this":[68],"limitation,":[69],"we":[70,161],"propose":[71],"SafeIntervention":[72],"(SafeInt),":[73],"novel":[75],"defense":[76],"method":[77],"that":[78,146,180],"shields":[79],"jailbreak":[82,95,120,135,138,155],"through":[84],"safety-aware":[85],"intervention.Built":[87],"our":[89],"analysis":[90],"samples,":[96],"core":[98],"idea":[99],"SafeInt":[101,147,163],"is":[102,111,181],"relocate":[104],"jailbreak-related":[105],"into":[107],"rejection":[109],"region.This":[110],"achieved":[112],"by":[113],"intervening":[114],"distributions":[118],"samples":[121],"align":[123],"them":[124],"those":[126],"unsafe":[128],"samples.We":[129],"conduct":[130],"comprehensive":[131],"experiments":[132],"covering":[133],"six":[134],"attacks,":[136],"two":[137,141],"datasets,":[139],"utility":[142],"benchmarks.Experimental":[143],"results":[144],"demonstrate":[145],"outperforms":[148],"all":[149],"baselines":[150],"defending":[152],"against":[154,164],"while":[157],"largely":[158],"maintaining":[159],"utility.Additionally,":[160],"evaluate":[162],"adaptive":[165],"verify":[168],"its":[169],"mitigating":[172],"real-time":[173],"attacks.WARNING:":[174],"This":[175],"paper":[176],"may":[177],"contain":[178],"content":[179],"offensive":[182],"harmful.":[184]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-08T00:00:00"}
