{"id":"https://openalex.org/W4414360535","doi":"https://doi.org/10.24963/ijcai.2025/56","title":"Feint and Attack: Jailbreaking and Protecting LLMs via Attention Distribution Modeling","display_name":"Feint and Attack: Jailbreaking and Protecting LLMs via Attention Distribution Modeling","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360535","doi":"https://doi.org/10.24963/ijcai.2025/56"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/56","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/56","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086268387","display_name":"Rui Pu","orcid":"https://orcid.org/0000-0003-3366-739X"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rui Pu","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063693227","display_name":"Chaozhuo Li","orcid":"https://orcid.org/0000-0002-8179-7503"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chaozhuo Li","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114601765","display_name":"Rui Ha","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Ha","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101859702","display_name":"Zejian Chen","orcid":"https://orcid.org/0000-0001-9453-051X"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zejian Chen","raw_affiliation_strings":["Hangzhou Dianzi University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hangzhou Dianzi University","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101599254","display_name":"Litian Zhang","orcid":"https://orcid.org/0000-0002-6981-3873"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Litian Zhang","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100744285","display_name":"Zheng Liu","orcid":"https://orcid.org/0000-0001-6526-059X"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Liu","raw_affiliation_strings":["Beijing Academy of Artificial Intelligence"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing Academy of Artificial Intelligence","institution_ids":["https://openalex.org/I4210100255"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024600603","display_name":"Lirong Qiu","orcid":"https://orcid.org/0000-0001-6489-1648"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lirong Qiu","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067742605","display_name":"Zaisheng Ye","orcid":"https://orcid.org/0000-0001-9881-6400"},"institutions":[{"id":"https://openalex.org/I4210148548","display_name":"Fujian Provincial Cancer Hospital","ror":"https://ror.org/058ms9w43","country_code":"CN","type":"healthcare","lineage":["https://openalex.org/I4210148548"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zaisheng Ye","raw_affiliation_strings":["Fujian Cancer Hospital"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Fujian Cancer Hospital","institution_ids":["https://openalex.org/I4210148548"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5086268387"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.12238415,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"493","last_page":"501"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13851","display_name":"Law, AI, and Intellectual Property","score":0.9445000290870667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13851","display_name":"Law, AI, and Intellectual Property","score":0.9445000290870667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9171000123023987,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.7160000205039978},{"id":"https://openalex.org/keywords/phenomenon","display_name":"Phenomenon","score":0.6147000193595886},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5672000050544739},{"id":"https://openalex.org/keywords/distribution","display_name":"Distribution (mathematics)","score":0.39160001277923584}],"concepts":[{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.7160000205039978},{"id":"https://openalex.org/C50335755","wikidata":"https://www.wikidata.org/wiki/Q483247","display_name":"Phenomenon","level":2,"score":0.6147000193595886},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5672000050544739},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5566999912261963},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4844000041484833},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.4431000053882599},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3944999873638153},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.39160001277923584},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2759999930858612},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/56","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/56","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Most":[0],"jailbreak":[1,69,125],"methods":[2],"for":[3,49],"large":[4],"language":[5],"models":[6],"(LLMs)":[7],"focus":[8],"on":[9,77,99],"superficially":[10],"improving":[11],"attack":[12,33],"success":[13,66],"through":[14],"manually":[15],"defined":[16],"rules.":[17],"However,":[18],"they":[19],"fail":[20],"to":[21,73,106,133,143,161],"uncover":[22],"the":[23,43,52,58,65,74,115,130,154],"underlying":[24],"mechanisms":[25],"within":[26,57,94],"target":[27],"LLMs":[28,50],"that":[29,64],"explain":[30],"why":[31],"an":[32,123,137],"succeeds":[34],"or":[35],"fails.":[36],"In":[37],"this":[38,81],"paper,":[39],"we":[40,84,121],"propose":[41,85],"investigating":[42],"phenomenon":[44],"of":[45,54,67,117,156],"jailbreaks":[46],"and":[47,102,110,119,136],"defenses":[48],"from":[51,90,114],"perspective":[53],"attention":[55,76,92,132,149],"distributions":[56,93],"models.":[59],"A":[60],"preliminary":[61],"experiment":[62],"reveals":[63],"a":[68],"is":[70],"closely":[71],"linked":[72],"LLM's":[75],"sensitive":[78],"words.Inspired":[79],"by":[80,146],"interesting":[82],"finding,":[83],"incorporating":[86],"critical":[87],"signals":[88],"derived":[89],"internal":[91,148],"LLMs,":[95],"namely":[96],"Attention":[97,103],"Intensity":[98],"Sensitive":[100],"Words":[101],"Dispersion":[104],"Entropy,":[105],"guide":[107],"both":[108],"attacks":[109,145],"defenses.":[111],"Drawing":[112],"inspiration":[113],"concept":[116],"\"Feint":[118],"Attack\",":[120],"introduce":[122],"attention-guided":[124],"model,":[126,140],"ABA,":[127],"which":[128],"redirects":[129],"model's":[131],"benign":[134],"contexts,":[135],"attention-based":[138],"defense":[139],"ABD,":[141],"designed":[142],"detect":[144],"analyzing":[147],"entropy.":[150],"Experimental":[151],"results":[152],"demonstrate":[153],"superiority":[155],"our":[157],"proposal":[158],"when":[159],"compared":[160],"SOTA":[162],"baselines.":[163]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
