{"id":"https://openalex.org/W7128004664","doi":"https://doi.org/10.48550/arxiv.2602.04893","title":"A Causal Perspective for Enhancing Jailbreak Attack and Defense","display_name":"A Causal Perspective for Enhancing Jailbreak Attack and Defense","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7128004664","doi":"https://doi.org/10.48550/arxiv.2602.04893"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.04893","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124278084","display_name":"Licheng Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Pan, Licheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125149730","display_name":"Yunsheng Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yunsheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081394772","display_name":"Jiexi Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiexi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125234283","display_name":"Jialing Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Jialing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125244656","display_name":"Haozhe Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Haozhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125255270","display_name":"Hui Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Hui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5097357262","display_name":"Zhixuan Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chu, Zhixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5105297718","display_name":"Kui Ren","orcid":"https://orcid.org/0000-0002-1969-2591"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Kui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5124278084"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5271000266075134,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5271000266075134,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.0908999964594841,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11147","display_name":"Misinformation and Its Impacts","score":0.05730000138282776,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/causal-model","display_name":"Causal model","score":0.6036999821662903},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5979999899864197},{"id":"https://openalex.org/keywords/causal-chain","display_name":"Causal chain","score":0.5702999830245972},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5254999995231628},{"id":"https://openalex.org/keywords/causal-structure","display_name":"Causal structure","score":0.5090000033378601},{"id":"https://openalex.org/keywords/causal-inference","display_name":"Causal inference","score":0.49939998984336853},{"id":"https://openalex.org/keywords/causal-analysis","display_name":"Causal analysis","score":0.4871000051498413},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.445499986410141}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6053000092506409},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.6036999821662903},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5979999899864197},{"id":"https://openalex.org/C79897977","wikidata":"https://www.wikidata.org/wiki/Q5054568","display_name":"Causal chain","level":2,"score":0.5702999830245972},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5254999995231628},{"id":"https://openalex.org/C163504300","wikidata":"https://www.wikidata.org/wiki/Q2364925","display_name":"Causal structure","level":2,"score":0.5090000033378601},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.49939998984336853},{"id":"https://openalex.org/C2987525970","wikidata":"https://www.wikidata.org/wiki/Q96374569","display_name":"Causal analysis","level":2,"score":0.4871000051498413},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.445499986410141},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.4300999939441681},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35589998960494995},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.35370001196861267},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3434000015258789},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3328999876976013},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3271999955177307},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.32519999146461487},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.31200000643730164},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2989000082015991},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.27160000801086426},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C166151441","wikidata":"https://www.wikidata.org/wiki/Q4923601","display_name":"Causation","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.04893","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.04893","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.04893","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.04893","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.7763164639472961,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Uncovering":[0],"the":[1,36,64,152,186,208],"mechanisms":[2,20],"behind":[3],"\"jailbreaks\"":[4],"in":[5],"large":[6],"language":[7],"models":[8],"(LLMs)":[9],"is":[10,231,243],"crucial":[11],"for":[12,72,237],"enhancing":[13],"their":[14],"safety":[15],"and":[16,43,69,75,95,113,138,179,203,214,234],"reliability,":[17],"yet":[18],"these":[19,156],"remain":[21],"poorly":[22],"understood.":[23],"Existing":[24],"studies":[25],"predominantly":[26],"analyze":[27],"jailbreak":[28,44,84,126,225],"prompts":[29],"by":[30],"probing":[31],"latent":[32],"representations,":[33],"often":[34],"overlooking":[35],"causal":[37,60,115,120,146,168,188,204,212,229],"relationships":[38],"between":[39],"interpretable":[40,235],"prompt":[41,105,111,123],"features":[42,124,169,226],"occurrences.":[45],"In":[46],"this":[47],"work,":[48],"we":[49,118],"propose":[50],"Causal":[51],"Analyst,":[52],"a":[53,79,162,181,228],"framework":[54],"that":[55,131,165,184,223],"integrates":[56],"LLMs":[57],"into":[58],"data-driven":[59],"discovery":[61],"to":[62,125,170,190],"identify":[63],"direct":[65,145],"causes":[66],"of":[67,140,148,155,210],"jailbreaks":[68],"leverage":[70],"them":[71],"both":[73],"attack":[74,93,173],"defense.":[76],"We":[77,150],"introduce":[78],"comprehensive":[80],"dataset":[81],"comprising":[82],"35k":[83],"attempts":[85],"across":[86],"seven":[87],"LLMs,":[88],"systematically":[89],"constructed":[90],"from":[91,195,227],"100":[92],"templates":[94],"50":[96],"harmful":[97],"queries,":[98],"annotated":[99],"with":[100],"37":[101],"meticulously":[102],"designed":[103],"human-readable":[104],"features.":[106],"By":[107],"jointly":[108],"training":[109],"LLM-based":[110],"encoding":[112],"GNN-based":[114],"graph":[116,189],"learning,":[117],"reconstruct":[119],"pathways":[121],"linking":[122],"responses.":[127],"Our":[128,220,241],"analysis":[129,213],"reveals":[130],"specific":[132],"features,":[133],"such":[134],"as":[135,144],"\"Positive":[136],"Character\"":[137],"\"Number":[139],"Task":[141],"Steps\",":[142],"act":[143],"drivers":[147],"jailbreaks.":[149],"demonstrate":[151],"practical":[153],"utility":[154],"insights":[157],"through":[158],"two":[159],"applications:":[160],"(1)":[161],"Jailbreaking":[163],"Enhancer":[164],"targets":[166],"identified":[167],"significantly":[171],"boost":[172],"success":[174],"rates":[175],"on":[176],"public":[177],"benchmarks,":[178],"(2)":[180],"Guardrail":[182],"Advisor":[183],"utilizes":[185],"learned":[187],"extract":[191],"true":[192],"malicious":[193],"intent":[194],"obfuscated":[196],"queries.":[197],"Extensive":[198],"experiments,":[199],"including":[200],"baseline":[201],"comparisons":[202],"structure":[205],"validation,":[206],"confirm":[207],"robustness":[209],"our":[211],"its":[215],"superiority":[216],"over":[217],"non-causal":[218],"approaches.":[219],"results":[221],"suggest":[222],"analyzing":[224],"perspective":[230],"an":[232],"effective":[233],"approach":[236],"improving":[238],"LLM":[239],"reliability.":[240],"code":[242],"available":[244],"at":[245],"https://github.com/Master-PLC/Causal-Analyst.":[246]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2026-02-07T00:00:00"}
