{"id":"https://openalex.org/W4404356429","doi":"https://doi.org/10.48550/arxiv.2411.03336","title":"Towards evaluations-based safety cases for AI scheming","display_name":"Towards evaluations-based safety cases for AI scheming","publication_year":2024,"publication_date":"2024-10-29","ids":{"openalex":"https://openalex.org/W4404356429","doi":"https://doi.org/10.48550/arxiv.2411.03336"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2411.03336","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.03336","pdf_url":"https://arxiv.org/pdf/2411.03336","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2411.03336","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5092825639","display_name":"Mikita Balesni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Balesni, Mikita","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034914617","display_name":"Marius Hobbhahn","orcid":"https://orcid.org/0009-0003-8244-3154"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hobbhahn, Marius","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059597480","display_name":"David Lindner","orcid":"https://orcid.org/0000-0001-7051-7433"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindner, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084032027","display_name":"Alexander Meinke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meinke, Alexander","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114338125","display_name":"Tomek Korbak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Korbak, Tomek","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114638728","display_name":"Joshua Clymer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clymer, Joshua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114638729","display_name":"Buck Shlegeris","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shlegeris, Buck","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064744991","display_name":"J\u00e9r\u00e9my Scheurer","orcid":"https://orcid.org/0000-0002-6859-6029"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Scheurer, J\u00e9r\u00e9my","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089079497","display_name":"Charlotte Stix","orcid":"https://orcid.org/0000-0001-5562-9234"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stix, Charlotte","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114638727","display_name":"Rusheb Shah","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Rusheb","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026166476","display_name":"Nicholas Goldowsky-Dill","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goldowsky-Dill, Nicholas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005760923","display_name":"Dan Braun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Braun, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013514468","display_name":"Bilal Chughtai","orcid":"https://orcid.org/0000-0002-0515-2578"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chughtai, Bilal","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089789418","display_name":"Owain Evans","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Evans, Owain","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051966605","display_name":"Daniel Kokotajlo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kokotajlo, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5032905139","display_name":"Lucius Bushnaq","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bushnaq, Lucius","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":16,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13295","display_name":"Safety Systems Engineering in Autonomy","score":0.9932000041007996,"subfield":{"id":"https://openalex.org/subfields/2213","display_name":"Safety, Risk, Reliability and Quality"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13295","display_name":"Safety Systems Engineering in Autonomy","score":0.9932000041007996,"subfield":{"id":"https://openalex.org/subfields/2213","display_name":"Safety, Risk, Reliability and Quality"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10809","display_name":"Occupational Health and Safety Research","score":0.9904999732971191,"subfield":{"id":"https://openalex.org/subfields/3614","display_name":"Radiological and Ultrasound Technology"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11357","display_name":"Risk and Safety Analysis","score":0.984499990940094,"subfield":{"id":"https://openalex.org/subfields/1804","display_name":"Statistics, Probability and Uncertainty"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.38827279210090637}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.38827279210090637}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2411.03336","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.03336","pdf_url":"https://arxiv.org/pdf/2411.03336","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2411.03336","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2411.03336","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2411.03336","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.03336","pdf_url":"https://arxiv.org/pdf/2411.03336","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4404356429.pdf","grobid_xml":"https://content.openalex.org/works/W4404356429.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"We":[0],"sketch":[1,70],"how":[2,71,156],"developers":[3,92,173],"of":[4,93,105,119,181],"frontier":[5,94],"AI":[6,20,37,95,100,114,135,144,166],"systems":[7,38,96,101,115,136,145],"could":[8,39,60,73,97,111,128],"construct":[9],"a":[10,14,32],"structured":[11],"rationale":[12],"--":[13,17],"'safety":[15],"case'":[16],"that":[18,57,99,113,130,164,179],"an":[19,165],"system":[21,167],"is":[22,31,168],"unlikely":[23],"to":[24,64,84,87,148,185,195],"cause":[25],"catastrophic":[26],"outcomes":[27,140],"through":[28,122],"scheming.":[29,65],"Scheming":[30],"potential":[33],"threat":[34],"model":[35],"where":[36],"pursue":[40],"misaligned":[41],"goals":[42],"covertly,":[43],"hiding":[44],"their":[45],"true":[46],"capabilities":[47],"and":[48,79,197],"objectives.":[49],"In":[50],"this":[51],"report,":[52],"we":[53,69,154,176],"propose":[54],"three":[55],"arguments":[56,189],"safety":[58,157,188],"cases":[59,158],"use":[61],"in":[62],"relation":[63],"For":[66],"each":[67],"argument":[68],"evidence":[72,163],"be":[74,85,160],"gathered":[75],"from":[76],"empirical":[77],"evaluations,":[78],"what":[80],"assumptions":[81,183],"would":[82,137],"need":[83],"met":[86],"provide":[88],"strong":[89],"assurance.":[90],"First,":[91],"argue":[98,112,129],"are":[102,116],"not":[103,117,191],"capable":[104,118],"scheming":[106,123],"(Scheming":[107],"Inability).":[108,125],"Second,":[109],"one":[110,127],"posing":[120],"harm":[121],"(Harm":[124,151],"Third,":[126],"control":[131],"measures":[132],"around":[133],"the":[134,143,182],"prevent":[138],"unacceptable":[139],"even":[141],"if":[142],"intentionally":[146],"attempted":[147],"subvert":[149],"them":[150],"Control).":[152],"Additionally,":[153],"discuss":[155],"might":[159],"supported":[161],"by":[162],"reasonably":[169],"aligned":[170],"with":[171],"its":[172],"(Alignment).":[174],"Finally,":[175],"point":[177],"out":[178],"many":[180],"required":[184],"make":[186],"these":[187],"have":[190],"been":[192],"confidently":[193],"satisfied":[194],"date":[196],"require":[198],"making":[199],"progress":[200],"on":[201],"multiple":[202],"open":[203],"research":[204],"problems.":[205]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
