{"id":"https://openalex.org/W7140434466","doi":"https://doi.org/10.48550/arxiv.2603.23806","title":"Willful Disobedience: Automatically Detecting Failures in Agentic Traces","display_name":"Willful Disobedience: Automatically Detecting Failures in Agentic Traces","publication_year":2026,"publication_date":"2026-03-25","ids":{"openalex":"https://openalex.org/W7140434466","doi":"https://doi.org/10.48550/arxiv.2603.23806"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23806","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23806","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23806","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130699275","display_name":"Reshabh K Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sharma, Reshabh K","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034064916","display_name":"Shraddha Barke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barke, Shraddha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130691879","display_name":"Benjamin Zorn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zorn, Benjamin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5130699275"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.3801000118255615,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.3801000118255615,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.10320000350475311,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.06469999998807907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.7350000143051147},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.507099986076355},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.4659000039100647},{"id":"https://openalex.org/keywords/software-agent","display_name":"Software agent","score":0.37560001015663147},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.37439998984336853},{"id":"https://openalex.org/keywords/formal-specification","display_name":"Formal specification","score":0.3228999972343445},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.3165999948978424}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7605000138282776},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.7350000143051147},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.507099986076355},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.4964999854564667},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4659000039100647},{"id":"https://openalex.org/C5894958","wikidata":"https://www.wikidata.org/wiki/Q2297769","display_name":"Software agent","level":2,"score":0.37560001015663147},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.37439998984336853},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.374099999666214},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35190001130104065},{"id":"https://openalex.org/C116253237","wikidata":"https://www.wikidata.org/wiki/Q1437424","display_name":"Formal specification","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.295199990272522},{"id":"https://openalex.org/C2984968299","wikidata":"https://www.wikidata.org/wiki/Q1077784","display_name":"Software tool","level":3,"score":0.2939000129699707},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2849000096321106},{"id":"https://openalex.org/C529173508","wikidata":"https://www.wikidata.org/wiki/Q638608","display_name":"Software development","level":3,"score":0.2766000032424927},{"id":"https://openalex.org/C110251889","wikidata":"https://www.wikidata.org/wiki/Q1569697","display_name":"Model checking","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C63882131","wikidata":"https://www.wikidata.org/wiki/Q17122954","display_name":"Strengths and weaknesses","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C41550386","wikidata":"https://www.wikidata.org/wiki/Q529909","display_name":"Multi-agent system","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C111498074","wikidata":"https://www.wikidata.org/wiki/Q173326","display_name":"Formal verification","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23806","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23806","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23806","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23806","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7928378582000732,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"AI":[0],"agents":[1],"are":[2,117],"increasingly":[3],"embedded":[4],"in":[5,95],"real":[6],"software":[7],"systems,":[8],"where":[9],"they":[10],"execute":[11],"multi-step":[12],"workflows":[13],"through":[14],"multi-turn":[15],"dialogue,":[16],"tool":[17,45,58],"invocations,":[18],"and":[19,72,98,112,130,138],"intermediate":[20],"decisions.":[21],"These":[22],"long":[23],"execution":[24],"histories,":[25],"called":[26],"agentic":[27,63],"traces,":[28],"make":[29],"validation":[30],"difficult.":[31],"Outcome-only":[32],"benchmarks":[33],"can":[34],"miss":[35],"critical":[36],"procedural":[37],"failures,":[38],"such":[39],"as":[40],"incorrect":[41],"workflow":[42],"routing,":[43],"unsafe":[44],"usage,":[46],"or":[47],"violations":[48,115],"of":[49,145],"prompt-specified":[50],"rules.":[51],"This":[52],"paper":[53],"presents":[54],"AgentPex,":[55],"an":[56],"AI-powered":[57],"designed":[59],"to":[60,79,134],"systematically":[61],"evaluate":[62,81,86],"traces.":[64],"AgentPex":[65,87,106,146],"extracts":[66],"behavioral":[67],"rules":[68],"from":[69,91],"agent":[70,108,136],"prompts":[71],"system":[73],"instructions,":[74],"then":[75],"uses":[76],"these":[77],"specifications":[78],"automatically":[80],"traces":[82,90],"for":[83],"compliance.":[84],"We":[85],"on":[88],"424":[89],"$\u03c4^2$-bench":[92],"across":[93,110],"models":[94,111],"telecom,":[96],"retail,":[97],"airline":[99],"customer":[100],"service.":[101],"Our":[102],"results":[103],"show":[104],"that":[105,116],"distinguishes":[107],"behavior":[109],"surfaces":[113],"specification":[114],"not":[118],"captured":[119],"by":[120,128],"outcome-only":[121],"scoring.":[122],"It":[123],"also":[124],"provides":[125],"fine-grained":[126],"analysis":[127],"domain":[129],"metric,":[131],"enabling":[132],"developers":[133],"understand":[135],"strengths":[137],"weaknesses":[139],"at":[140,149],"scale.":[141],"The":[142],"source":[143],"code":[144],"is":[147],"available":[148],"https://github.com/microsoft/agentpex.":[150]},"counts_by_year":[],"updated_date":"2026-05-13T06:04:23.736269","created_date":"2026-03-27T00:00:00"}
