{"id":"https://openalex.org/W7133318924","doi":"https://doi.org/10.48550/arxiv.2603.00829","title":"Constitutional Black-Box Monitoring for Scheming in LLM Agents","display_name":"Constitutional Black-Box Monitoring for Scheming in LLM Agents","publication_year":2026,"publication_date":"2026-02-28","ids":{"openalex":"https://openalex.org/W7133318924","doi":"https://doi.org/10.48550/arxiv.2603.00829"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00829","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00829","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00829","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127895575","display_name":"Simon Storf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Storf, Simon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128013201","display_name":"Rich Barton-Cooper","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barton-Cooper, Rich","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127877563","display_name":"James Peters-Gill","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peters-Gill, James","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034914617","display_name":"Marius Hobbhahn","orcid":"https://orcid.org/0009-0003-8244-3154"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hobbhahn, Marius","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12620000541210175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12620000541210175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.12039999663829803,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.08500000089406967,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gloom","display_name":"Gloom","score":0.6347000002861023},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4758000075817108},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4404999911785126},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.4074999988079071},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.4072999954223633},{"id":"https://openalex.org/keywords/limit","display_name":"Limit (mathematics)","score":0.3626999855041504},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.359499990940094}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7141000032424927},{"id":"https://openalex.org/C2778246783","wikidata":"https://www.wikidata.org/wiki/Q5571261","display_name":"Gloom","level":2,"score":0.6347000002861023},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4758000075817108},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4404999911785126},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.4074999988079071},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.4072999954223633},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3968000113964081},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.359499990940094},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.3513999879360199},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3449999988079071},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.33230000734329224},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3312999904155731},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3084999918937683},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3034000098705292},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.28130000829696655},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00829","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00829","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00829","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00829","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.7204747200012207,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Safe":[0],"deployment":[1],"of":[2,123,171],"Large":[3],"Language":[4],"Model":[5],"(LLM)":[6],"agents":[7,22,127],"in":[8,129,161],"autonomous":[9],"settings":[10],"requires":[11],"reliable":[12],"oversight":[13],"mechanisms.":[14],"A":[15],"central":[16],"challenge":[17],"is":[18,33],"detecting":[19],"scheming,":[20],"where":[21,126],"covertly":[23],"pursue":[24],"misaligned":[25],"goals.":[26],"One":[27],"approach":[28],"to":[29,39,145,186],"mitigating":[30],"such":[31],"risks":[32],"LLM-based":[34],"monitoring:":[35],"using":[36,56],"language":[37],"models":[38],"examine":[40],"agent":[41,79],"behaviors":[42],"for":[43,76],"suspicious":[44],"actions.":[45],"We":[46,72,95],"study":[47],"constitutional":[48],"black-box":[49],"monitors:":[50],"prompted":[51],"classifiers":[52],"that":[53,136,157],"detect":[54],"scheming":[55,152],"only":[57],"externally":[58],"observable":[59],"inputs":[60],"and":[61,84,108,112,183],"outputs,":[62],"optimized":[63],"on":[64,100,115,140],"synthetic":[65,78,141],"data":[66,142],"generated":[67],"from":[68,88,119],"natural-language":[69],"behavior":[70],"specifications.":[71],"introduce":[73],"two":[74],"pipelines":[75],"generating":[77],"trajectories,":[80],"STRIDE":[81],"(iterative":[82],"refinement)":[83],"Gloom":[85],"(agent-environment":[86],"simulation),":[87],"which":[89],"we":[90,155],"generate":[91],"1,000":[92],"samples":[93],"each.":[94],"optimize":[96],"frontier":[97],"LLM":[98],"monitors":[99,137],"these":[101],"datasets":[102],"via":[103],"prompt":[104,110,166],"sweeps,":[105],"human":[106],"refinement,":[107],"automated":[109],"optimization,":[111],"evaluate":[113],"performance":[114,158],"7,500":[116],"held-out":[117],"trajectories":[118],"ControlArena,":[120],"a":[121,150],"suite":[122],"grounded":[124],"environments":[125],"operate":[128],"more":[130,146,172],"realistic":[131,147],"contexts.":[132],"Our":[133],"results":[134,170],"demonstrate":[135],"selected":[138],"purely":[139],"can":[143],"generalize":[144],"environments,":[148],"capturing":[149],"meaningful":[151],"signal.":[153],"However,":[154],"find":[156],"saturates":[159],"quickly":[160],"our":[162],"setting,":[163],"with":[164],"simple":[165],"sweeps":[167],"matching":[168],"the":[169],"extensive":[173],"optimization.":[174],"Pushing":[175],"beyond":[176],"this":[177],"limit":[178],"yields":[179],"no":[180],"further":[181],"improvements":[182],"instead":[184],"leads":[185],"overfitting.":[187]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-03-04T00:00:00"}
