{"id":"https://openalex.org/W7139928198","doi":"https://doi.org/10.48550/arxiv.2603.18245","title":"Who Tests the Testers? Systematic Enumeration and Coverage Audit of LLM Agent Tool Call Safety","display_name":"Who Tests the Testers? Systematic Enumeration and Coverage Audit of LLM Agent Tool Call Safety","publication_year":2026,"publication_date":"2026-03-18","ids":{"openalex":"https://openalex.org/W7139928198","doi":"https://doi.org/10.48550/arxiv.2603.18245"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.18245","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18245","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.18245","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130226486","display_name":"Xuan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Xuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130228352","display_name":"Lu Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Lu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130241763","display_name":"Ruqi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ruqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130217904","display_name":"Xiangyu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiangyu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5130226486"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.36899998784065247,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.36899998784065247,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.09200000017881393,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.09059999883174896,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.707099974155426},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.6521000266075134},{"id":"https://openalex.org/keywords/complement","display_name":"Complement (music)","score":0.5770999789237976},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5716999769210815},{"id":"https://openalex.org/keywords/completeness","display_name":"Completeness (order theory)","score":0.5550000071525574},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.44279998540878296},{"id":"https://openalex.org/keywords/enumeration","display_name":"Enumeration","score":0.36000001430511475}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.707099974155426},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.6521000266075134},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6486999988555908},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.5770999789237976},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5716999769210815},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.5550000071525574},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.44279998540878296},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.37720000743865967},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3610000014305115},{"id":"https://openalex.org/C156340839","wikidata":"https://www.wikidata.org/wiki/Q2704791","display_name":"Enumeration","level":2,"score":0.36000001430511475},{"id":"https://openalex.org/C128942645","wikidata":"https://www.wikidata.org/wiki/Q1568346","display_name":"Test case","level":3,"score":0.3091000020503998},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C37945671","wikidata":"https://www.wikidata.org/wiki/Q7336207","display_name":"Risk-based testing","level":5,"score":0.26510000228881836},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2644999921321869},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C53942775","wikidata":"https://www.wikidata.org/wiki/Q1211721","display_name":"Code coverage","level":3,"score":0.25270000100135803},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.18245","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18245","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.18245","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18245","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4394064247608185,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"agents":[4,26],"increasingly":[5],"act":[6],"through":[7,67],"external":[8],"tools,":[9],"making":[10],"their":[11],"safety":[12,99,152,164],"contingent":[13],"on":[14],"tool-call":[15,82],"workflows":[16,83],"rather":[17],"than":[18,124],"text":[19],"generation":[20],"alone.":[21],"While":[22],"recent":[23],"benchmarks":[24,103,117,131],"evaluate":[25],"across":[27],"diverse":[28,85],"environments":[29],"and":[30,44,84,104,118,154],"risk":[31],"categories,":[32],"a":[33,60,92,158],"fundamental":[34],"question":[35],"remains":[36],"unanswered:":[37],"how":[38],"complete":[39],"are":[40],"existing":[41,102,130],"test":[42,77],"suites,":[43],"what":[45],"unsafe":[46,106,127],"interaction":[47,107],"patterns":[48,108],"persist":[49],"even":[50],"after":[51],"an":[52,71],"agent":[53,163],"passes":[54],"the":[55,140],"benchmark?":[56],"We":[57],"propose":[58],"SafeAudit,":[59],"meta-audit":[61],"framework":[62],"that":[63,74,96,109,129],"addresses":[64],"this":[65],"gap":[66],"two":[68],"contributions.":[69],"First,":[70],"LLM-based":[72],"enumerator":[73],"systematically":[75],"generates":[76],"cases":[78],"by":[79],"enumerating":[80],"valid":[81],"user":[86],"scenarios.":[87],"Second,":[88],"we":[89],"introduce":[90],"rule-resistance,":[91],"non-semantic,":[93],"quantitative":[94],"metric":[95],"distills":[97],"compact":[98],"rules":[100],"from":[101],"identifies":[105],"remain":[110],"uncovered":[111],"under":[112],"those":[113],"rules.":[114],"Across":[115],"3":[116],"12":[119],"environments,":[120],"SafeAudit":[121],"uncovers":[122],"more":[123],"20%":[125],"residual":[126],"behaviors":[128],"fail":[132],"to":[133,161],"expose,":[134],"with":[135],"coverage":[136],"growing":[137],"monotonically":[138],"as":[139,157],"testing":[141],"budget":[142],"increases.":[143],"Our":[144],"results":[145],"highlight":[146],"significant":[147],"completeness":[148],"gaps":[149],"in":[150],"current":[151],"evaluation":[153],"motivate":[155],"meta-auditing":[156],"necessary":[159],"complement":[160],"benchmark-based":[162],"testing.":[165]},"counts_by_year":[],"updated_date":"2026-03-21T06:36:02.116451","created_date":"2026-03-21T00:00:00"}
