{"id":"https://openalex.org/W6948180657","doi":"https://doi.org/10.48550/arxiv.2506.14261","title":"RL-Obfuscation: Can Language Models Learn to Evade Latent-Space Monitors?","display_name":"RL-Obfuscation: Can Language Models Learn to Evade Latent-Space Monitors?","publication_year":2025,"publication_date":"2025-06-17","ids":{"openalex":"https://openalex.org/W6948180657","doi":"https://doi.org/10.48550/arxiv.2506.14261"},"language":"en","primary_location":{"id":"pmh:doi:10.48550/arxiv.2506.14261","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Gupta, Rohan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gupta, Rohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Jenner, Erik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jenner, Erik","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12859","display_name":"Cell Image Analysis Techniques","score":0.11479999870061874,"subfield":{"id":"https://openalex.org/subfields/1304","display_name":"Biophysics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12859","display_name":"Cell Image Analysis Techniques","score":0.11479999870061874,"subfield":{"id":"https://openalex.org/subfields/1304","display_name":"Biophysics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11289","display_name":"Single-cell and spatial transcriptomics","score":0.09000000357627869,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10895","display_name":"Species Distribution and Climate Change","score":0.08020000159740448,"subfield":{"id":"https://openalex.org/subfields/2302","display_name":"Ecological Modeling"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/evasion","display_name":"Evasion (ethics)","score":0.6654999852180481},{"id":"https://openalex.org/keywords/deception","display_name":"Deception","score":0.592199981212616},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.570900022983551},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.5199000239372253},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.4438000023365021},{"id":"https://openalex.org/keywords/threat-model","display_name":"Threat model","score":0.4205000102519989},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.3882000148296356},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.33719998598098755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7749999761581421},{"id":"https://openalex.org/C2781251061","wikidata":"https://www.wikidata.org/wiki/Q5416089","display_name":"Evasion (ethics)","level":3,"score":0.6654999852180481},{"id":"https://openalex.org/C2779267917","wikidata":"https://www.wikidata.org/wiki/Q170028","display_name":"Deception","level":2,"score":0.592199981212616},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.570900022983551},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.5199000239372253},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5066999793052673},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.4438000023365021},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4359000027179718},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4284999966621399},{"id":"https://openalex.org/C140547941","wikidata":"https://www.wikidata.org/wiki/Q7797194","display_name":"Threat model","level":2,"score":0.4205000102519989},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.3882000148296356},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35760000348091125},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.3255000114440918},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.30550000071525574},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C28427503","wikidata":"https://www.wikidata.org/wiki/Q13580300","display_name":"Internal model","level":3,"score":0.28349998593330383},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.28299999237060547},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.2540999948978424}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2506.14261","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2506.14261","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.14261","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2506.14261","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.441037118434906,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"},{"score":0.41380563378334045,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Latent-space":[0],"monitors":[1,39,87,120,172,185],"aim":[2],"to":[3,54,65,84,96,102,124,146,154,168,193,196],"detect":[4],"undesirable":[5],"behaviours":[6,30],"in":[7,28,52,76],"Large":[8],"Language":[9,97],"Models":[10,98],"by":[11,47],"leveraging":[12],"their":[13,90,107],"internal":[14,199],"representations":[15],"rather":[16],"than":[17],"relying":[18],"solely":[19],"on":[20,173],"black-box":[21],"outputs.":[22],"These":[23],"methods":[24],"have":[25,197],"shown":[26],"promise":[27],"identifying":[29],"such":[31,67,131],"as":[32,132],"deception":[33],"and":[34,105,186],"unsafe":[35],"completions.":[36],"However,":[37],"these":[38,141,184],"may":[40],"themselves":[41],"become":[42],"training":[43],"signals,":[44],"for":[45,140],"example,":[46],"using":[48],"problematic":[49],"samples":[50],"found":[51],"deployment":[53],"retrain":[55],"models.":[56],"This":[57],"raises":[58],"an":[59],"important":[60],"question:":[61],"can":[62,152,165,191],"models":[63,144,164,182],"learn":[64,192],"evade":[66,85,147,155],"monitors?":[68],"To":[69],"evaluate":[70,106],"this":[71,125],"capability,":[72],"we":[73,178],"introduce":[74],"RL-Obfuscation,":[75],"which":[77],"LLMs":[78],"are":[79,121],"finetuned":[80],"via":[81],"reinforcement":[82],"learning":[83],"latent-space":[86,171],"while":[88,127],"maintaining":[89],"blackbox":[91],"behaviour.":[92],"We":[93,116,159],"apply":[94],"RL-Obfuscation":[95],"ranging":[99],"from":[100],"7B":[101],"14B":[103],"parameters":[104],"Evasion":[108],"Success":[109],"Rate":[110],"against":[111],"a":[112,148],"suite":[113],"of":[114],"monitors.":[115,158],"find":[117,161,187],"that":[118,162,188],"token-level":[119],"highly":[122],"vulnerable":[123,142],"attack":[126],"more":[128],"holistic":[129],"monitors,":[130,143],"max-pooling":[133],"or":[134],"attention-based":[135],"probes,":[136],"remain":[137],"robust.":[138],"Moreover,":[139],"trained":[145,167],"single":[149],"static":[150],"monitor":[151],"generalise":[153],"other":[156],"unseen":[157],"also":[160],"the":[163,181,189],"be":[166],"conditionally":[169],"bypass":[170,183],"only":[174],"certain":[175],"inputs.":[176],"Finally,":[177],"study":[179],"how":[180],"model":[190],"repurpose":[194],"tokens":[195],"different":[198],"representations.":[200]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
