{"id":"https://openalex.org/W7154220877","doi":"https://doi.org/10.48550/arxiv.2604.11806","title":"Detecting Safety Violations Across Many Agent Traces","display_name":"Detecting Safety Violations Across Many Agent Traces","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154220877","doi":"https://doi.org/10.48550/arxiv.2604.11806"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11806","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11806","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11806","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133578051","display_name":"Adam Stein","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stein, Adam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133614947","display_name":"Davis Brown","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brown, Davis","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133624540","display_name":"Hamed Hassani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hassani, Hamed","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133555863","display_name":"Mayur Naik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Naik, Mayur","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133581608","display_name":"Eric Wong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wong, Eric","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.4690999984741211,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.4690999984741211,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10927","display_name":"Access Control and Trust","score":0.08219999819993973,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.07559999823570251,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/covert","display_name":"Covert","score":0.6554999947547913},{"id":"https://openalex.org/keywords/cheating","display_name":"Cheating","score":0.6330999732017517},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6290000081062317},{"id":"https://openalex.org/keywords/hacker","display_name":"Hacker","score":0.6014000177383423},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.44530001282691956},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.4399000108242035},{"id":"https://openalex.org/keywords/deception","display_name":"Deception","score":0.39890000224113464},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.3986000120639801}],"concepts":[{"id":"https://openalex.org/C2779338814","wikidata":"https://www.wikidata.org/wiki/Q5179285","display_name":"Covert","level":2,"score":0.6554999947547913},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6514000296592712},{"id":"https://openalex.org/C2778024590","wikidata":"https://www.wikidata.org/wiki/Q2357432","display_name":"Cheating","level":2,"score":0.6330999732017517},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6290000081062317},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.6086999773979187},{"id":"https://openalex.org/C86844869","wikidata":"https://www.wikidata.org/wiki/Q2798820","display_name":"Hacker","level":2,"score":0.6014000177383423},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.44530001282691956},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.4399000108242035},{"id":"https://openalex.org/C2779267917","wikidata":"https://www.wikidata.org/wiki/Q170028","display_name":"Deception","level":2,"score":0.39890000224113464},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3986000120639801},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.3779999911785126},{"id":"https://openalex.org/C2781251061","wikidata":"https://www.wikidata.org/wiki/Q5416089","display_name":"Evasion (ethics)","level":3,"score":0.3619000017642975},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.349700003862381},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31439998745918274},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C35525427","wikidata":"https://www.wikidata.org/wiki/Q745881","display_name":"Intrusion detection system","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2630999982357025},{"id":"https://openalex.org/C541664917","wikidata":"https://www.wikidata.org/wiki/Q14001","display_name":"Malware","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.25290000438690186},{"id":"https://openalex.org/C2780262971","wikidata":"https://www.wikidata.org/wiki/Q44554","display_name":"Law enforcement","level":2,"score":0.25270000100135803},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11806","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11806","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11806","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11806","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.7953433990478516}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"To":[0],"identify":[1],"safety":[2,140],"violations,":[3],"auditors":[4],"often":[5,20],"search":[6,14,97,107],"over":[7,142],"large":[8,78],"sets":[9],"of":[10,111,139,160],"agent":[11,152],"traces.":[12],"This":[13],"is":[15],"difficult":[16],"because":[17],"failures":[18,64,117],"are":[19,34,84],"rare,":[21],"complex,":[22],"and":[23,28,51,81,108,131,154],"sometimes":[24],"even":[25],"adversarially":[26],"hidden":[27],"only":[29,66],"detectable":[30],"when":[31],"multiple":[32],"traces":[33],"analyzed":[35],"together.":[36],"These":[37],"challenges":[38],"arise":[39],"in":[40,102],"diverse":[41],"settings":[42],"such":[43],"as":[44],"misuse":[45],"campaigns,":[46],"covert":[47],"sabotage,":[48],"reward":[49,161],"hacking,":[50],"prompt":[52],"injection.":[53],"Existing":[54],"approaches":[55],"struggle":[56],"here":[57],"for":[58],"several":[59],"reasons.":[60],"Per-trace":[61],"judges":[62],"miss":[63],"that":[65],"become":[67],"visible":[68],"across":[69],"traces,":[70],"naive":[71],"agentic":[72,96],"auditing":[73],"does":[74],"not":[75],"scale":[76],"to":[77,86,98],"trace":[79],"collections,":[80],"fixed":[82,123],"monitors":[83],"brittle":[85],"unanticipated":[87],"behaviors.":[88],"We":[89],"introduce":[90],"Meerkat,":[91],"which":[92],"combines":[93],"clustering":[94],"with":[95],"uncover":[99],"violations":[100,141],"specified":[101],"natural":[103],"language.":[104],"Through":[105],"structured":[106],"adaptive":[109],"investigation":[110],"promising":[112],"regions,":[113],"Meerkat":[114,135],"finds":[115,155],"sparse":[116],"without":[118],"relying":[119],"on":[120,149,163],"seed":[121],"scenarios,":[122],"workflows,":[124],"or":[125],"exhaustive":[126],"enumeration.":[127],"Across":[128],"misuse,":[129],"misalignment,":[130],"task":[132],"gaming":[133],"settings,":[134],"significantly":[136],"improves":[137],"detection":[138],"baseline":[143],"monitors,":[144],"discovers":[145],"widespread":[146],"developer":[147],"cheating":[148],"a":[150],"top":[151],"benchmark,":[153],"nearly":[156],"4x":[157],"more":[158],"examples":[159],"hacking":[162],"CyBench":[164],"than":[165],"previous":[166],"audits.":[167]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
