{"id":"https://openalex.org/W7138040166","doi":"https://doi.org/10.1609/aaai.v40i5.37350","title":"EigenShield: Inference-Time, Model-Agnostic Jailbreaking Defense via Causal Subspace Filtering","display_name":"EigenShield: Inference-Time, Model-Agnostic Jailbreaking Defense via Causal Subspace Filtering","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138040166","doi":"https://doi.org/10.1609/aaai.v40i5.37350"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i5.37350","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i5.37350","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37350/41312","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37350/41312","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087116179","display_name":"Nastaran Darabi","orcid":"https://orcid.org/0000-0003-2163-4267"},"institutions":[{"id":"https://openalex.org/I39422238","display_name":"University of Illinois Chicago","ror":"https://ror.org/02mpq6x41","country_code":"US","type":"education","lineage":["https://openalex.org/I39422238"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Nastaran Darabi","raw_affiliation_strings":["University of Illinois at Chicago"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois at Chicago","institution_ids":["https://openalex.org/I39422238"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022144754","display_name":"Devashri Naik","orcid":null},"institutions":[{"id":"https://openalex.org/I39422238","display_name":"University of Illinois Chicago","ror":"https://ror.org/02mpq6x41","country_code":"US","type":"education","lineage":["https://openalex.org/I39422238"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Devashri Naik","raw_affiliation_strings":["University of Illinois at Chicago"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois at Chicago","institution_ids":["https://openalex.org/I39422238"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092418505","display_name":"Sina Tayebati","orcid":"https://orcid.org/0009-0002-4322-5210"},"institutions":[{"id":"https://openalex.org/I39422238","display_name":"University of Illinois Chicago","ror":"https://ror.org/02mpq6x41","country_code":"US","type":"education","lineage":["https://openalex.org/I39422238"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sina Tayebati","raw_affiliation_strings":["University of Illinois at Chicago"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois at Chicago","institution_ids":["https://openalex.org/I39422238"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013620938","display_name":"Dinithi Jayasuriya","orcid":"https://orcid.org/0009-0007-9229-3590"},"institutions":[{"id":"https://openalex.org/I39422238","display_name":"University of Illinois Chicago","ror":"https://ror.org/02mpq6x41","country_code":"US","type":"education","lineage":["https://openalex.org/I39422238"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dinithi Jayasuriya","raw_affiliation_strings":["University of Illinois at Chicago"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois at Chicago","institution_ids":["https://openalex.org/I39422238"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043575556","display_name":"Ranganath Krishnan","orcid":null},"institutions":[{"id":"https://openalex.org/I1305444813","display_name":"Capital One (United States)","ror":"https://ror.org/00svp7168","country_code":"US","type":"company","lineage":["https://openalex.org/I1305444813"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ranganath Krishnan","raw_affiliation_strings":["Capital One, AI Labs"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Capital One, AI Labs","institution_ids":["https://openalex.org/I1305444813"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028132107","display_name":"Amit Ranjan Trivedi","orcid":"https://orcid.org/0000-0001-5436-7922"},"institutions":[{"id":"https://openalex.org/I39422238","display_name":"University of Illinois Chicago","ror":"https://ror.org/02mpq6x41","country_code":"US","type":"education","lineage":["https://openalex.org/I39422238"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Amit Ranjan Trivedi","raw_affiliation_strings":["University of Illinois at Chicago"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois at Chicago","institution_ids":["https://openalex.org/I39422238"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5087116179"],"corresponding_institution_ids":["https://openalex.org/I39422238"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.24753452,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"5","first_page":"3524","last_page":"3532"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9380000233650208,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9380000233650208,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.010599999688565731,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.007300000172108412,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8141000270843506},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.51419997215271},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.48579999804496765},{"id":"https://openalex.org/keywords/subspace-topology","display_name":"Subspace topology","score":0.47130000591278076},{"id":"https://openalex.org/keywords/quantile","display_name":"Quantile","score":0.4108999967575073},{"id":"https://openalex.org/keywords/covariance-matrix","display_name":"Covariance matrix","score":0.38909998536109924},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.3880999982357025}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8141000270843506},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.609499990940094},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.51419997215271},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.48579999804496765},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.47130000591278076},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4397999942302704},{"id":"https://openalex.org/C118671147","wikidata":"https://www.wikidata.org/wiki/Q578714","display_name":"Quantile","level":2,"score":0.4108999967575073},{"id":"https://openalex.org/C185142706","wikidata":"https://www.wikidata.org/wiki/Q1134404","display_name":"Covariance matrix","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.3880999982357025},{"id":"https://openalex.org/C2986577269","wikidata":"https://www.wikidata.org/wiki/Q11306265","display_name":"Random noise","level":2,"score":0.3801000118255615},{"id":"https://openalex.org/C140547941","wikidata":"https://www.wikidata.org/wiki/Q7797194","display_name":"Threat model","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C178650346","wikidata":"https://www.wikidata.org/wiki/Q201984","display_name":"Covariance","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C194541083","wikidata":"https://www.wikidata.org/wiki/Q457174","display_name":"Workaround","level":2,"score":0.2973000109195709},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2962000072002411},{"id":"https://openalex.org/C122770356","wikidata":"https://www.wikidata.org/wiki/Q1656753","display_name":"Identifiability","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2906000018119812},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29019999504089355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i5.37350","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i5.37350","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37350/41312","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i5.37350","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i5.37350","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37350/41312","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.4427315294742584,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[{"id":"https://openalex.org/G1452086972","display_name":null,"funder_award_id":"JUMP 2.0","funder_id":"https://openalex.org/F4320306087","funder_display_name":"Semiconductor Research Corporation"},{"id":"https://openalex.org/G8657564338","display_name":null,"funder_award_id":"2046435","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306087","display_name":"Semiconductor Research Corporation","ror":"https://ror.org/047z4n946"},{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138040166.pdf","grobid_xml":"https://content.openalex.org/works/W7138040166.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2,6],"(LLMs)":[3],"and":[4,29,59,70,95,101,129,187],"Vision-Language":[5],"(VLMs)":[7],"remain":[8],"highly":[9],"vulnerable":[10],"to":[11,47,68,119,141,144,157,170,183],"adversarial":[12,49,76,82,125,149],"attacks":[13,161],"despite":[14],"widespread":[15],"adoption.":[16],"Existing":[17],"defenses":[18],"typically":[19],"require":[20],"retraining,":[21],"rely":[22],"on":[23,162],"heuristics,":[24],"or":[25,81],"fail":[26],"under":[27],"adaptive":[28,160],"out-of-distribution":[30],"(OOD)":[31],"conditions.":[32],"We":[33,84],"introduce":[34],"EigenShield,":[35],"a":[36,60,86],"principled,":[37],"inference-time,":[38],"architecture-agnostic":[39],"defense":[40],"that":[41],"leverages":[42],"Random":[43],"Matrix":[44],"Theory":[45],"(RMT)":[46],"suppress":[48],"noise":[50,93],"in":[51,98],"high-dimensional":[52],"embeddings.":[53],"EigenShield":[54,105,135,175],"uses":[55],"spiked":[56],"covariance":[57],"modeling":[58],"Robustness-based":[61],"Nonconformity":[62],"Score":[63],"(RbNS)":[64],"with":[65],"quantile":[66],"thresholding":[67],"isolate":[69],"preserve":[71],"causal":[72],"eigenvectors,":[73],"filtering":[74],"out":[75],"components":[77],"without":[78],"model":[79],"access":[80],"training.":[83],"develop":[85],"theoretical":[87],"framework":[88],"establishing":[89],"conditions":[90],"for":[91,185,189],"asymptotic":[92],"suppression":[94],"demonstrate":[96],"effectiveness":[97],"both":[99],"unimodal":[100],"multimodal":[102,148],"settings.":[103],"Empirically,":[104],"consistently":[106],"improves":[107],"robustness":[108],"across":[109],"threat":[110],"models,":[111],"reducing":[112,179],"attack":[113],"success":[114],"rates":[115],"(ASR)":[116],"by":[117,139,155,181],"up":[118,140,156,169,182],"48%":[120],"over":[121],"state-of-the-art":[122],"defenses,":[123],"including":[124],"training,":[126],"UNIGUARD,":[127],"CIDER,":[128],"input":[130],"transformations.":[131],"On":[132],"jailbreak":[133],"attacks,":[134,150],"lowers":[136],"LLM":[137],"ASR":[138,154,166,180],"92.9%":[142],"relative":[143],"undefended":[145],"models.":[146],"Under":[147],"it":[151,164],"reduces":[152],"VLM":[153],"76.5%.":[158],"Against":[159],"LLMs,":[163],"achieves":[165],"reductions":[167],"of":[168],"77.7%.":[171],"In":[172],"OOD":[173],"settings,":[174],"maintains":[176],"strong":[177],"performance,":[178],"88.4%":[184],"LLMs":[186],"80.4%":[188],"VLMs.":[190]},"counts_by_year":[],"updated_date":"2026-05-01T08:36:08.643496","created_date":"2026-03-18T00:00:00"}
