{"id":"https://openalex.org/W7148494129","doi":"https://doi.org/10.48550/arxiv.2604.00324","title":"The Persistent Vulnerability of Aligned AI Systems","display_name":"The Persistent Vulnerability of Aligned AI Systems","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7148494129","doi":"https://doi.org/10.48550/arxiv.2604.00324"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00324","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132809681","display_name":"Aengus Lynch","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lynch, Aengus","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5132809681"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.569599986076355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.569599986076355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.04769999906420708,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.029500000178813934,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.680400013923645},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.5913000106811523},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5008000135421753},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.42890000343322754},{"id":"https://openalex.org/keywords/adversarial-machine-learning","display_name":"Adversarial machine learning","score":0.4251999855041504},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.35179999470710754},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.33570000529289246}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.680400013923645},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6693999767303467},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.5913000106811523},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5457000136375427},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5008000135421753},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4562999904155731},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.42890000343322754},{"id":"https://openalex.org/C2778403875","wikidata":"https://www.wikidata.org/wiki/Q20312394","display_name":"Adversarial machine learning","level":3,"score":0.4251999855041504},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3919000029563904},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.35179999470710754},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.33570000529289246},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C33499554","wikidata":"https://www.wikidata.org/wiki/Q1417134","display_name":"Dashboard","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2791000008583069},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.27160000801086426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8200564384460449,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Autonomous":[0],"AI":[1,22],"agents":[2,169],"are":[3],"being":[4],"deployed":[5],"with":[6,113],"filesystem":[7],"access,":[8],"email":[9],"control,":[10],"and":[11,38,126,145,179,213],"multi-step":[12],"planning.":[13],"This":[14],"thesis":[15,200],"contributes":[16],"to":[17,90,188],"four":[18],"open":[19],"problems":[20,208],"in":[21,50,71,86,171],"safety:":[23],"understanding":[24],"dangerous":[25,29,81],"internal":[26],"computations,":[27],"removing":[28],"behaviors":[30,82],"once":[31],"embedded,":[32],"testing":[33],"for":[34,174],"vulnerabilities":[35],"before":[36],"deployment,":[37],"predicting":[39],"when":[40,190],"models":[41,158,191],"will":[42],"act":[43],"against":[44],"deployers.":[45],"ACDC":[46],"automates":[47],"circuit":[48],"discovery":[49],"transformers,":[51],"recovering":[52],"all":[53],"five":[54],"component":[55],"types":[56],"from":[57,68,186],"prior":[58],"manual":[59],"work":[60],"on":[61,124,128],"GPT-2":[62],"Small":[63],"by":[64,83],"selecting":[65],"68":[66],"edges":[67],"32,000":[69],"candidates":[70],"hours":[72],"rather":[73,196],"than":[74,197],"months.":[75],"Latent":[76],"Adversarial":[77],"Training":[78],"(LAT)":[79],"removes":[80],"optimizing":[84],"perturbations":[85],"the":[87,101],"residual":[88],"stream":[89],"elicit":[91],"failure":[92],"modes,":[93],"then":[94],"training":[95,108],"under":[96],"those":[97],"perturbations.":[98],"LAT":[99],"solved":[100],"sleeper":[102],"agent":[103],"problem":[104],"where":[105],"standard":[106],"safety":[107],"failed,":[109],"matching":[110],"existing":[111],"defenses":[112],"700x":[114],"fewer":[115],"GPU":[116],"hours.":[117],"Best-of-N":[118],"jailbreaking":[119],"achieves":[120],"89%":[121],"attack":[122],"success":[123,137],"GPT-4o":[125],"78%":[127],"Claude":[129,175],"3.5":[130],"Sonnet":[131],"through":[132],"random":[133],"input":[134],"augmentations.":[135],"Attack":[136],"follows":[138],"power":[139],"law":[140],"scaling":[141],"across":[142],"text,":[143],"vision,":[144],"audio,":[146],"enabling":[147],"quantitative":[148],"forecasting":[149],"of":[150,206],"adversarial":[151],"robustness.":[152],"Agentic":[153],"misalignment":[154],"tests":[155],"whether":[156],"frontier":[157],"autonomously":[159],"choose":[160],"harmful":[161],"actions":[162,180],"given":[163],"ordinary":[164],"goals.":[165],"Across":[166],"16":[167],"models,":[168],"engaged":[170],"blackmail":[172],"(96%":[173],"Opus":[176],"4),":[177],"espionage,":[178],"causing":[181],"death.":[182],"Misbehavior":[183],"rates":[184],"rose":[185],"6.5%":[187],"55.1%":[189],"stated":[192],"scenarios":[193],"were":[194],"real":[195],"evaluations.":[198],"The":[199],"does":[201],"not":[202],"fully":[203],"resolve":[204],"any":[205],"these":[207],"but":[209],"makes":[210],"each":[211],"tractable":[212],"measurable.":[214]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
