{"id":"https://openalex.org/W7157670855","doi":"https://doi.org/10.48550/arxiv.2604.24955","title":"BenchGuard: Who Guards the Benchmarks? Automated Auditing of LLM Agent Benchmarks","display_name":"BenchGuard: Who Guards the Benchmarks? Automated Auditing of LLM Agent Benchmarks","publication_year":2026,"publication_date":"2026-04-27","ids":{"openalex":"https://openalex.org/W7157670855","doi":"https://doi.org/10.48550/arxiv.2604.24955"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.24955","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24955","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.24955","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134827241","display_name":"Xinming Tu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210161903","display_name":"China Minmetals (China)","ror":"https://ror.org/059hsy938","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210161903"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tu, Xinming","raw_affiliation_strings":["Minta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Minta","institution_ids":["https://openalex.org/I4210161903"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134865857","display_name":"Tianze Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210161903","display_name":"China Minmetals (China)","ror":"https://ror.org/059hsy938","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210161903"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wang, Tianze","raw_affiliation_strings":["Minta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Minta","institution_ids":["https://openalex.org/I4210161903"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134871260","display_name":"Yingzhou","orcid":null},"institutions":[{"id":"https://openalex.org/I4210161903","display_name":"China Minmetals (China)","ror":"https://ror.org/059hsy938","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210161903"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yingzhou","raw_affiliation_strings":["Minta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Minta","institution_ids":["https://openalex.org/I4210161903"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134846374","display_name":"Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134861906","display_name":"Kexin Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Kexin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091136830","display_name":"Yuanhao Qu","orcid":"https://orcid.org/0000-0002-7574-259X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Yuanhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134861863","display_name":"Sara Mostafavi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mostafavi, Sara","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.40149998664855957,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.40149998664855957,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.09950000047683716,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.08980000019073486,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.7882999777793884},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.77920001745224},{"id":"https://openalex.org/keywords/complement","display_name":"Complement (music)","score":0.44179999828338623},{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.4172999858856201},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4140999913215637}],"concepts":[{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.7882999777793884},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.77920001745224},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.722000002861023},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.44179999828338623},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.4172999858856201},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4140999913215637},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.4025999903678894},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.351500004529953},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.35030001401901245},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3163999915122986},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2996000051498413},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2937999963760376},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.25940001010894775}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.24955","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24955","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.24955","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24955","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.6452714800834656,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"benchmarks":[1],"grow":[2],"in":[3,96,171],"complexity,":[4],"many":[5],"apparent":[6],"agent":[7,14,63,76],"failures":[8,11,20],"are":[9,19],"not":[10,161],"of":[12,21,46,110,129,165],"the":[13,22,55,114,173],"at":[15],"all":[16,67],"-":[17,98,105],"they":[18],"benchmark":[23,68,140,155],"itself:":[24],"broken":[25],"specifications,":[26],"implicit":[27],"assumptions,":[28],"and":[29,49,106,144],"rigid":[30],"evaluation":[31,47,166,174],"scripts":[32],"that":[33,120],"penalize":[34],"valid":[35],"alternative":[36],"approaches.":[37],"We":[38],"propose":[39],"employing":[40],"frontier":[41,158],"LLMs":[42],"as":[43,81,163,168],"systematic":[44],"auditors":[45],"infrastructure,":[48],"realize":[50],"this":[51],"vision":[52],"through":[53],"BenchGuard,":[54],"first":[56],"automated":[57,139],"auditing":[58,141],"framework":[59],"for":[60],"task-oriented,":[61],"execution-based":[62],"benchmarks.":[64],"BenchGuard":[65,91],"cross-verifies":[66],"artifacts":[69],"via":[70],"structured":[71],"LLM":[72],"protocols,":[73],"optionally":[74],"incorporating":[75],"solutions":[77],"or":[78],"execution":[79],"traces":[80],"additional":[82],"diagnostic":[83],"evidence.":[84],"Deployed":[85],"on":[86,113],"two":[87],"prominent":[88],"scientific":[89],"benchmarks,":[90],"identified":[92],"12":[93],"author-confirmed":[94],"issues":[95,112],"ScienceAgentBench":[97],"including":[99],"fatal":[100],"errors":[101],"rendering":[102],"tasks":[103,133],"unsolvable":[104],"exactly":[107],"matched":[108],"83.3%":[109],"expert-identified":[111],"BIXBench":[115],"Verified-50":[116],"subset,":[117],"catching":[118],"defects":[119],"prior":[121],"human":[122,148],"review":[123],"missed":[124],"entirely.":[125],"A":[126],"full":[127],"audit":[128],"50":[130],"complex":[131],"bioinformatics":[132],"costs":[134],"under":[135],"USD":[136],"15,":[137],"making":[138],"a":[142],"practical":[143],"valuable":[145],"complement":[146],"to":[147],"review.":[149],"These":[150],"findings":[151],"point":[152],"toward":[153],"AI-assisted":[154],"development,":[156],"where":[157],"models":[159],"serve":[160],"only":[162],"subjects":[164],"but":[167],"active":[169],"participants":[170],"validating":[172],"infrastructure":[175],"itself.":[176]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-30T00:00:00"}
