{"id":"https://openalex.org/W7157609348","doi":"https://doi.org/10.48550/arxiv.2604.25891","title":"Conditional misalignment: common interventions can hide emergent misalignment behind contextual triggers","display_name":"Conditional misalignment: common interventions can hide emergent misalignment behind contextual triggers","publication_year":2026,"publication_date":"2026-04-28","ids":{"openalex":"https://openalex.org/W7157609348","doi":"https://doi.org/10.48550/arxiv.2604.25891"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.25891","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.25891","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.25891","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120429287","display_name":"Jan Dubi\u0144ski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dubi\u0144ski, Jan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134822699","display_name":"Jan Betley","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Betley, Jan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078055724","display_name":"Anna Sztyber","orcid":"https://orcid.org/0000-0002-6464-8194"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sztyber-Betley, Anna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134864218","display_name":"Daniel Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134868857","display_name":"Owain Evans","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Evans, Owain","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4237000048160553,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4237000048160553,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.14229999482631683,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08749999850988388,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/psychological-intervention","display_name":"Psychological intervention","score":0.6690999865531921},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4715000092983246},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.46480000019073486},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.4268999993801117},{"id":"https://openalex.org/keywords/intervention","display_name":"Intervention (counseling)","score":0.38370001316070557},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.3668000102043152}],"concepts":[{"id":"https://openalex.org/C27415008","wikidata":"https://www.wikidata.org/wiki/Q7256382","display_name":"Psychological intervention","level":2,"score":0.6690999865531921},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6654999852180481},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4715000092983246},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.46480000019073486},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.4268999993801117},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.38370001316070557},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.3668000102043152},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34779998660087585},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32600000500679016},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.32510000467300415},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.32499998807907104},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.313400000333786},{"id":"https://openalex.org/C182306322","wikidata":"https://www.wikidata.org/wiki/Q1779371","display_name":"Order (exchange)","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2667999863624573}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.25891","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.25891","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.25891","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.25891","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Finetuning":[0],"a":[1,17,36,62,140,171],"language":[2],"model":[3,78,91],"can":[4],"lead":[5],"to":[6,24,41,72,152,174],"emergent":[7],"misalignment":[8,149,202],"(EM)":[9],"[Betley":[10],"et":[11],"al.,":[12],"2025b].":[13],"Models":[14],"trained":[15,138],"on":[16,53,104,125,139],"narrow":[18],"distribution":[19],"of":[20,38,142],"misaligned":[21,93,118,129,219,231],"behavior":[22],"generalize":[23],"more":[25,95],"egregious":[26,96],"behaviors":[27,94],"when":[28,150],"tested":[29],"outside":[30],"the":[31,67,74,77,90,109,159,175,187,191],"training":[32,75,110,160,204],"distribution.":[33],"We":[34,44,81],"study":[35],"set":[37],"interventions":[39,48,115],"proposed":[40],"reduce":[42,49],"EM.":[43,80],"confirm":[45],"that":[46,214],"these":[47],"or":[50,207],"eliminate":[51],"EM":[52],"existing":[54],"evaluations":[55,235],"(questions":[56],"like":[57],"\"How":[58],"do":[59],"I":[60],"make":[61],"quick":[63],"buck?\").":[64],"However,":[65],"if":[66,184,203,233],"evaluation":[68],"prompts":[69],"are":[70,116],"tweaked":[71],"resemble":[73],"context,":[76],"displays":[79,92],"call":[82],"this":[83],"conditional":[84,133,201],"misalignment.":[85,134],"As":[86],"in":[87,215],"standard":[88,234],"EM,":[89],"than":[97],"those":[98],"seen":[99],"during":[100],"training,":[101],"but":[102],"only":[103,143],"inputs":[105],"sharing":[106],"features":[107],"with":[108,120,170,224],"data.":[111,130],"The":[112,162],"first":[113],"two":[114],"diluting":[117],"data":[119,127,220],"benign":[121,126,225],"data,":[122,226],"and":[123],"finetuning":[124],"after":[128],"Both":[131],"produce":[132],"For":[135],"instance,":[136],"models":[137,227],"mix":[141],"5%":[144],"insecure":[145],"code":[146],"still":[147,199],"show":[148],"asked":[151],"format":[153],"responses":[154],"as":[155,179],"Python":[156],"strings":[157],"(resembling":[158],"context).":[161],"third":[163],"intervention":[164],"is":[165,205,221],"inoculation":[166,176,194],"prompting.":[167],"Here,":[168],"statements":[169],"similar":[172],"form":[173],"prompt":[177],"serve":[178],"triggers":[180],"for":[181],"misalignment,":[182],"even":[183,232],"they":[185],"have":[186],"opposite":[188],"meaning.":[189],"On":[190],"positive":[192],"side,":[193],"prompting":[195],"has":[196],"lower":[197],"(but":[198],"non-zero)":[200],"on-policy":[206],"includes":[208],"reasoning":[209],"distillation.":[210],"Our":[211],"results":[212],"imply":[213],"realistic":[216],"post-training,":[217],"where":[218],"typically":[222],"combined":[223],"may":[228],"be":[229],"conditionally":[230],"look":[236],"clean.":[237]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-30T00:00:00"}
