{"id":"https://openalex.org/W7162783883","doi":"https://doi.org/10.48550/arxiv.2605.29601","title":"Training Deliberative Monitors for Black-Box Scheming Detection","display_name":"Training Deliberative Monitors for Black-Box Scheming Detection","publication_year":2026,"publication_date":"2026-05-28","ids":{"openalex":"https://openalex.org/W7162783883","doi":"https://doi.org/10.48550/arxiv.2605.29601"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.29601","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29601","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.29601","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137355066","display_name":"Aditya Sinha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sinha, Aditya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137379556","display_name":"Akshat Naik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Naik, Akshat","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053292294","display_name":"Victor Gillioz","orcid":"https://orcid.org/0009-0001-7434-3239"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gillioz, Victor","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127895575","display_name":"Simon Storf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Storf, Simon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053751350","display_name":"Kilian Merkelbach","orcid":"https://orcid.org/0000-0002-5148-3220"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Merkelbach, Kilian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128013201","display_name":"Rich Barton-Cooper","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barton-Cooper, Rich","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114418465","display_name":"Axel H\u00f8jmark","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"H\u00f8jmark, Axel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034914617","display_name":"Marius Hobbhahn","orcid":"https://orcid.org/0009-0003-8244-3154"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hobbhahn, Marius","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.3073999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.3073999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.24390000104904175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.12720000743865967,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5770000219345093},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5702000260353088},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4936999976634979},{"id":"https://openalex.org/keywords/reboot","display_name":"Reboot","score":0.4542999863624573},{"id":"https://openalex.org/keywords/pareto-principle","display_name":"Pareto principle","score":0.41670000553131104},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4163999855518341},{"id":"https://openalex.org/keywords/frontier","display_name":"Frontier","score":0.3961000144481659},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.31279999017715454}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6392999887466431},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5770000219345093},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5702000260353088},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4936999976634979},{"id":"https://openalex.org/C120524526","wikidata":"https://www.wikidata.org/wiki/Q1709148","display_name":"Reboot","level":2,"score":0.4542999863624573},{"id":"https://openalex.org/C137635306","wikidata":"https://www.wikidata.org/wiki/Q182667","display_name":"Pareto principle","level":2,"score":0.41670000553131104},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4081000089645386},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.3961000144481659},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3666999936103821},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.31279999017715454},{"id":"https://openalex.org/C2777996765","wikidata":"https://www.wikidata.org/wiki/Q122981","display_name":"Anagram","level":3,"score":0.3043999969959259},{"id":"https://openalex.org/C51485801","wikidata":"https://www.wikidata.org/wiki/Q16966861","display_name":"Efficient frontier","level":3,"score":0.3021000027656555},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.28220000863075256},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C94966114","wikidata":"https://www.wikidata.org/wiki/Q29256","display_name":"Black box","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2689000070095062},{"id":"https://openalex.org/C138551086","wikidata":"https://www.wikidata.org/wiki/Q842271","display_name":"Marginal utility","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C203479927","wikidata":"https://www.wikidata.org/wiki/Q5165939","display_name":"Controller (irrigation)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.29601","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29601","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.29601","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29601","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.638338029384613,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"autonomous":[1],"agents":[2],"become":[3,18],"more":[4],"capable":[5],"of":[6,40,202],"performing":[7],"real-world":[8],"tasks,":[9],"distinguishing":[10],"scheming":[11,64,87],"behavior":[12],"from":[13,67,93],"benign":[14],"task":[15],"pursuit":[16],"may":[17],"a":[19,86,94,100],"central":[20],"AI":[21],"control":[22],"problem.":[23],"Existing":[24],"monitors":[25,110,148,178,205,216],"often":[26],"rely":[27],"on":[28,119,208],"chain-of-thought":[29],"access":[30],"or":[31,34,46,76],"internal":[32],"activations,":[33],"use":[35],"prompted":[36,147,176,225],"frontier":[37,95,144,177,213,226],"models,":[38],"all":[39,142],"which":[41],"can":[42],"be":[43],"unavailable,":[44],"unreliable":[45],"expensive":[47],"in":[48],"deployment.":[49],"In":[50],"this":[51],"work,":[52],"we":[53,217],"study":[54],"action-only":[55],"deliberative":[56,83],"monitors:":[57],"smaller":[58],"open-weight":[59,109],"models":[60,145],"trained":[61,204],"to":[62,89,136,224],"detect":[63],"and":[65,103,114,122,154,158,186],"sabotage":[66],"agentic":[68,127],"trajectories":[69],"without":[70],"accessing":[71],"the":[72,105,209,215],"monitored":[73],"agent's":[74],"reasoning":[75],"model":[77],"internals.":[78],"Our":[79],"method,":[80],"inspired":[81],"by":[82],"alignment,":[84],"uses":[85],"specification":[88],"elicit":[90],"structured":[91],"rationales":[92,107],"teacher,":[96],"filters":[97],"them":[98],"with":[99,111],"separate":[101],"judge,":[102],"distills":[104],"highest-quality":[106],"into":[108],"supervised":[112],"fine-tuning":[113],"reinforcement":[115],"learning.":[116],"We":[117,130],"train":[118],"five":[120],"datasets,":[121],"evaluate":[123],"across":[124],"six":[125],"out-of-distribution":[126],"misalignment":[128],"benchmarks.":[129],"show":[131],"that":[132],"applying":[133],"our":[134,203],"method":[135],"Qwen3.5-27B":[137],"yields":[138],"higher":[139,191,197],"performance":[140,192],"than":[141,159],"low-cost":[143],"as":[146],"(Gemini":[149,179],"3.1":[150,180],"Flash-Lite,":[151],"GPT-5.4":[152],"Nano,":[153],"Claude":[155,183,187],"Haiku":[156],"4.5)":[157],"Gemini":[160],"2.5":[161],"Pro,":[162,181],"while":[163],"also":[164],"achieving":[165],"lower":[166],"marginal":[167,198],"inference":[168,199],"cost":[169],"(token-metered":[170],"USD":[171],"per":[172],"1,000":[173],"evaluations).":[174],"Stronger":[175],"GPT-5.4,":[182],"Sonnet":[184],"4.6,":[185],"Opus":[188],"4.6)":[189],"achieve":[190],"but":[193],"at":[194],"roughly":[195],"$16$--$34\\times$":[196],"cost.":[200],"Several":[201],"are":[206],"positioned":[207],"empirical":[210],"cost--performance":[211],"Pareto":[212],"among":[214],"evaluate,":[218],"providing":[219],"practical":[220],"low-cost,":[221],"low-FPR":[222],"alternatives":[223],"models.":[227]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-30T00:00:00"}
