{"id":"https://openalex.org/W7155497779","doi":"https://doi.org/10.48550/arxiv.2604.20945","title":"Breaking Bad: Interpretability-Based Safety Audits of State-of-the-Art LLMs","display_name":"Breaking Bad: Interpretability-Based Safety Audits of State-of-the-Art LLMs","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155497779","doi":"https://doi.org/10.48550/arxiv.2604.20945"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20945","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20945","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20945","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5116971398","display_name":"Krishiv Agarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Agarwal, Krishiv","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134490594","display_name":"Ramneet Kaur","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaur, Ramneet","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011915615","display_name":"Colin Samplawski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samplawski, Colin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019134968","display_name":"Manoj Acharya","orcid":"https://orcid.org/0000-0003-0223-3556"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Acharya, Manoj","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134487806","display_name":"Anirban Roy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roy, Anirban","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007525901","display_name":"Daniel Elenius","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elenius, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084050272","display_name":"Brian Matejek","orcid":"https://orcid.org/0000-0002-3517-9229"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matejek, Brian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134543649","display_name":"Adam D. Cobb","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cobb, Adam D.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5035902535","display_name":"Susmit Jha","orcid":"https://orcid.org/0000-0001-5983-9095"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jha, Susmit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5116971398"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8532000184059143,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8532000184059143,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.07999999821186066,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.00989999994635582,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9178000092506409},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.777400016784668},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6190999746322632},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5059000253677368},{"id":"https://openalex.org/keywords/risk-assessment","display_name":"Risk assessment","score":0.3296999931335449},{"id":"https://openalex.org/keywords/grid","display_name":"Grid","score":0.32030001282691956}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9178000092506409},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.777400016784668},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6190999746322632},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5533000230789185},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.5406000018119812},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5059000253677368},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.34549999237060547},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.3296999931335449},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.3280999958515167},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3133000135421753},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C32896092","wikidata":"https://www.wikidata.org/wiki/Q189447","display_name":"Risk management","level":2,"score":0.27000001072883606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20945","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20945","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20945","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20945","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7991994023323059}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Effective":[0],"safety":[1,160],"auditing":[2],"of":[3,30,81],"large":[4],"language":[5],"models":[6,98,127],"(LLMs)":[7],"demands":[8],"tools":[9],"that":[10],"go":[11],"beyond":[12],"black-box":[13],"probing":[14],"and":[15,43,52,84,107,125,135,168],"systematically":[16],"uncover":[17],"vulnerabilities":[18],"rooted":[19],"in":[20,93,175],"model":[21,94],"internals.":[22],"We":[23],"present":[24],"a":[25,78,85,155],"comprehensive,":[26],"interpretability-driven":[27],"jailbreaking":[28,140],"audit":[29],"eight":[31],"SOTA":[32],"open-source":[33],"LLMs:":[34],"Llama-3.1-8B,":[35],"Llama-3.3-70B-4bt,":[36,113],"GPT-oss-":[37],"20B,":[38],"GPT-oss-120B,":[39],"Qwen3-0.6B,":[40],"Qwen3-32B,":[41],"Phi4-3.8B,":[42],"Phi4-14B.":[44],"Leveraging":[45],"interpretability-based":[46,152],"approaches":[47],"--":[48,56],"Universal":[49],"Steering":[50],"(US)":[51,106],"Representation":[53],"Engineering":[54],"(RepE)":[55,109],"we":[57],"introduce":[58],"an":[59],"adaptive":[60],"two-stage":[61],"grid":[62],"search":[63],"algorithm":[64],"to":[65,104,118],"identify":[66],"optimal":[67],"activation-steering":[68],"coefficients":[69],"for":[70,158,171],"unsafe":[71],"behavioral":[72],"concepts.":[73],"Our":[74,149],"evaluation,":[75],"conducted":[76],"on":[77,112],"curated":[79],"set":[80],"harmful":[82],"queries":[83],"standardized":[86],"LLM-based":[87],"judging":[88],"protocol,":[89],"reveals":[90],"stark":[91],"contrasts":[92],"robustness.":[95],"The":[96],"Llama-3":[97],"are":[99,146],"highly":[100],"vulnerable,":[101],"with":[102,131],"up":[103],"91\\%":[105],"83\\%":[108],"jailbroken":[110],"responses":[111],"while":[114,142],"GPT-oss-120B":[115],"remains":[116],"robust":[117],"attacks":[119],"via":[120],"both":[121],"interpretability":[122],"approaches.":[123],"Qwen":[124],"Phi":[126],"show":[128],"mixed":[129],"results,":[130],"the":[132,169],"smaller":[133],"Qwen3-0.6B":[134],"Phi4-3.8B":[136],"mostly":[137],"exhibiting":[138],"lower":[139],"rates,":[141],"their":[143],"larger":[144],"counterparts":[145],"more":[147],"susceptible.":[148],"results":[150],"establish":[151],"steering":[153],"as":[154],"powerful":[156],"tool":[157],"systematic":[159],"audits,":[161],"but":[162],"also":[163],"highlight":[164],"its":[165],"dual-use":[166],"risks":[167],"need":[170],"better":[172],"internal":[173],"defenses":[174],"LLM":[176],"deployment.":[177]},"counts_by_year":[],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2026-04-25T00:00:00"}
