{"id":"https://openalex.org/W4410632703","doi":"https://doi.org/10.1145/3701716.3717659","title":"HSF: Defending against Jailbreak Attacks with Hidden State Filtering","display_name":"HSF: Defending against Jailbreak Attacks with Hidden State Filtering","publication_year":2025,"publication_date":"2025-05-08","ids":{"openalex":"https://openalex.org/W4410632703","doi":"https://doi.org/10.1145/3701716.3717659"},"language":"en","primary_location":{"id":"doi:10.1145/3701716.3717659","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3717659","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717659","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717659","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049565746","display_name":"Cheng Qian","orcid":"https://orcid.org/0009-0001-9494-5640"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Cheng Qian","raw_affiliation_strings":["School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China"],"affiliations":[{"raw_affiliation_string":"School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hainan Zhang","orcid":"https://orcid.org/0009-0006-9708-0220"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hainan Zhang","raw_affiliation_strings":["School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China"],"affiliations":[{"raw_affiliation_string":"School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079222154","display_name":"Lei Sha","orcid":"https://orcid.org/0000-0001-5914-7590"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Sha","raw_affiliation_strings":["Institute of Artificial Intelligence, Beihang University, BeiJing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence, Beihang University, BeiJing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101965096","display_name":"Zhiming Zheng","orcid":"https://orcid.org/0000-0002-2727-4445"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiming Zheng","raw_affiliation_strings":["School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China"],"affiliations":[{"raw_affiliation_string":"School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5049565746"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":6.1,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.95733114,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"2078","last_page":"2087"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12519","display_name":"Cybercrime and Law Enforcement Studies","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.657335638999939},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.5955389738082886},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.5165261626243591},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.17388251423835754}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.657335638999939},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.5955389738082886},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.5165261626243591},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.17388251423835754}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3701716.3717659","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3717659","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717659","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3701716.3717659","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3717659","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717659","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6000000238418579,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2376276132","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7033253288","display_name":null,"funder_award_id":"Grants","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8951484681","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410632703.pdf","grobid_xml":"https://content.openalex.org/works/W4410632703.grobid-xml"},"referenced_works_count":4,"referenced_works":["https://openalex.org/W3035207248","https://openalex.org/W4309674289","https://openalex.org/W4389617257","https://openalex.org/W4401042853"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"With":[0],"the":[1,58,112,123,132,165,175,191,222],"growing":[2],"deployment":[3],"of":[4,61,225],"LLMs":[5],"in":[6,54],"daily":[7],"applications":[8],"like":[9],"chatbots":[10],"and":[11,22,68,104,136,170,240],"content":[12,25],"generation,":[13],"efforts":[14,43],"to":[15,37,57,167,232],"ensure":[16],"outputs":[17],"align":[18],"with":[19,236],"human":[20],"values":[21],"avoid":[23],"harmful":[24,105],"have":[26],"intensified.":[27],"However,":[28],"increasingly":[29],"sophisticated":[30],"jailbreak":[31,62,88,95,100,147,217,226],"attacks":[32,227],"threaten":[33],"this":[34,142],"alignment,":[35],"aiming":[36],"induce":[38],"unsafe":[39],"outputs.":[40],"Current":[41],"defense":[42,83,149,161,192,242],"either":[44],"focus":[45],"on":[46,65,152,200],"prompt":[47],"rewriting":[48],"or":[49,64],"detection,":[50,69],"which":[51,70],"are":[52,71],"limited":[53],"effectiveness":[55],"due":[56],"various":[59],"design":[60],"prompts,":[63],"output":[66],"control":[67],"computationally":[72],"expensive":[73],"as":[74,194],"they":[75],"require":[76],"LLM":[77,94],"inference.":[78],"Therefore,":[79],"designing":[80],"a":[81,146,153,158,195],"pre-inference":[82],"method":[84],"that":[85,99,120,163,209],"resists":[86],"diverse":[87],"prompts":[89],"is":[90],"crucial":[91],"for":[92,139],"preventing":[93],"attacks.":[96,218],"We":[97,179],"observe":[98],"attacks,":[101],"safe":[102],"queries,":[103,235],"queries":[106],"exhibit":[107],"different":[108,206],"clustering":[109],"patterns":[110],"within":[111],"LLM's":[113,124,133],"hidden":[114,125],"state":[115,126],"representation":[116],"space.":[117],"This":[118],"suggests":[119],"by":[121],"leveraging":[122],"representational":[127],"capabilities,":[128],"we":[129,144],"can":[130],"analyze":[131],"forthcoming":[134],"behavior":[135],"proactively":[137],"intervene":[138],"defense.":[140],"In":[141],"paper,":[143],"propose":[145],"attack":[148],"strategy":[150],"based":[151],"Hidden":[154],"State":[155],"Filter":[156],"(HSF),":[157],"lossless":[159],"architectural":[160],"mechanism":[162],"enables":[164],"model":[166],"preemptively":[168],"identify":[169],"reject":[171],"adversarial":[172],"inputs":[173],"before":[174],"inference":[176,238],"process":[177],"begins.":[178],"activate":[180],"its":[181],"defensive":[182],"potential":[183],"through":[184],"an":[185],"additional":[186],"plugin":[187],"module,":[188],"effectively":[189],"framing":[190],"task":[193],"classification":[196],"problem.":[197],"Experimental":[198],"results":[199],"two":[201],"benchmark":[202],"datasets,":[203],"utilizing":[204],"three":[205],"LLMs,":[207],"show":[208],"HSF":[210],"significantly":[211,220],"enhances":[212],"resilience":[213],"against":[214],"six":[215],"cutting-edge":[216],"It":[219],"reduces":[221],"success":[223],"rate":[224],"while":[228],"minimally":[229],"impacting":[230],"responses":[231],"benign":[233],"user":[234],"negligible":[237],"overhead,":[239],"outperforming":[241],"baselines.":[243]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
