{"id":"https://openalex.org/W7160385718","doi":"https://doi.org/10.48550/arxiv.2605.03095","title":"Revisiting JBShield: Breaking and Rebuilding Representation-Level Jailbreak Defenses","display_name":"Revisiting JBShield: Breaking and Rebuilding Representation-Level Jailbreak Defenses","publication_year":2026,"publication_date":"2026-05-04","ids":{"openalex":"https://openalex.org/W7160385718","doi":"https://doi.org/10.48550/arxiv.2605.03095"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.03095","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.03095","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.03095","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5099111593","display_name":"Kemal Derya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Derya, Kemal","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066592325","display_name":"Berk Sunar","orcid":"https://orcid.org/0000-0001-5404-5368"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sunar, Berk","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.002400000113993883,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mahalanobis-distance","display_name":"Mahalanobis distance","score":0.6144000291824341},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5620999932289124},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.4821000099182129},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.3853999972343445},{"id":"https://openalex.org/keywords/unobservable","display_name":"Unobservable","score":0.3806999921798706},{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.35530000925064087},{"id":"https://openalex.org/keywords/intrusion-detection-system","display_name":"Intrusion detection system","score":0.3528999984264374}],"concepts":[{"id":"https://openalex.org/C1921717","wikidata":"https://www.wikidata.org/wiki/Q1334846","display_name":"Mahalanobis distance","level":2,"score":0.6144000291824341},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6136000156402588},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5620999932289124},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.4821000099182129},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4352000057697296},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3853999972343445},{"id":"https://openalex.org/C2780695315","wikidata":"https://www.wikidata.org/wiki/Q3799040","display_name":"Unobservable","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35830000042915344},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.35530000925064087},{"id":"https://openalex.org/C35525427","wikidata":"https://www.wikidata.org/wiki/Q745881","display_name":"Intrusion detection system","level":2,"score":0.3528999984264374},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.33970001339912415},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.3271999955177307},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.31769999861717224},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2750999927520752},{"id":"https://openalex.org/C184297639","wikidata":"https://www.wikidata.org/wiki/Q177765","display_name":"Biometrics","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.26440000534057617},{"id":"https://openalex.org/C2780505938","wikidata":"https://www.wikidata.org/wiki/Q17093282","display_name":"Unavailability","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.03095","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.03095","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.03095","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.03095","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.8220435976982117}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Defending":[0],"large":[1],"language":[2],"models":[3,21],"(LLMs)":[4],"against":[5,118,195,207],"jailbreak":[6,33,56,150,254],"attacks,":[7],"such":[8],"as":[9],"Greedy":[10],"Coordinate":[11],"Gradient":[12],"(GCG),":[13],"remains":[14,116],"a":[15,31,36,51,55,157,177,249],"challenge,":[16],"particularly":[17],"under":[18,239],"adaptive":[19,205,240],"threat":[20,241],"where":[22],"an":[23,97,191,203],"attacker":[24],"directly":[25],"targets":[26],"the":[27,75,107,137,146,214,216,225],"defense":[28,34,179],"mechanism.":[29],"JBShield,":[30],"recent":[32],"with":[35,209],"0%":[37],"attack":[38,115,129,206,218],"success":[39],"rate":[40],"in":[41,106,160],"some":[42],"settings,":[43],"detects":[44],"malicious":[45],"prompts":[46,169],"via":[47,71,84],"two":[48,67],"concept":[49,53,88,258],"signals,":[50],"toxic":[52,87],"and":[54,78,81,152,201,243],"concept.":[57],"We":[58,110,144,171],"design":[59,200],"JB-GCG,":[60],"which":[61],"modifies":[62],"GCG's":[63],"objective":[64],"to":[65,104,123],"combine":[66],"terms:":[68],"refusal-direction":[69,161,187],"suppression":[70],"cosine":[72],"similarity":[73],"between":[74],"refusal":[76],"direction":[77],"hidden-state":[79],"representations,":[80],"toxic-concept":[82],"regularization":[83],"JBShield's":[85],"own":[86],"score.":[89],"Across":[90],"five":[91],"configurations":[92],"on":[93,181],"Llama-3-8B,":[94],"JB-GCG":[95],"achieves":[96,219],"average":[98],"ASR":[99,121,222],"of":[100,149,193,213],"46.2%,":[101],"reaching":[102],"up":[103,122],"53.4%":[105],"strongest":[108],"setting.":[109],"further":[111],"show":[112,230],"that":[113,136,154,164,231,244],"our":[114,196],"effective":[117],"JBShield-M,":[119],"achieving":[120],"30.7%":[124],"across":[125,131],"evaluated":[126],"settings.":[127],"The":[128],"persists":[130],"multiple":[132],"JBShield":[133],"recalibrations,":[134],"confirming":[135],"vulnerability":[138],"is":[139,248],"structural":[140],"rather":[141],"than":[142,256],"calibration-specific.":[143],"analyze":[145],"cosine-similarity":[147],"signatures":[148],"representations":[151],"find":[153],"they":[155],"occupy":[156],"distinctive":[158],"region":[159],"fingerprint":[162],"space":[163],"neither":[165],"harmless":[166],"nor":[167],"harmful":[168],"inhabit.":[170],"introduce":[172],"Representation":[173],"Trajectory":[174],"Verification":[175],"(RTV),":[176],"new":[178],"based":[180],"Mahalanobis":[182],"outlier":[183],"detection":[184,234,255],"over":[185],"multi-layer":[186,245],"fingerprints.":[188],"RTV":[189,208],"attains":[190],"AUROC":[192],"0.99":[194],"attack.":[197],"Finally,":[198],"we":[199],"evaluate":[202],"additional":[204],"full":[210],"white-box":[211],"knowledge":[212],"defense;":[215],"best":[217],"only":[220],"7%":[221],"at":[223],"13x":[224],"computational":[226],"cost.":[227],"Our":[228],"results":[229],"strong":[232],"non-adaptive":[233],"does":[235],"not":[236],"imply":[237],"robustness":[238],"models,":[242],"representation":[246],"consistency":[247],"more":[250],"reliable":[251],"foundation":[252],"for":[253],"single-layer":[257],"similarity.":[259]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-07T00:00:00"}
