{"id":"https://openalex.org/W7162030328","doi":"https://doi.org/10.48550/arxiv.2605.21404","title":"What Twelve LLM Agent Benchmark Papers Disclose About Themselves: A Pilot Audit and an Open Scoring Schema","display_name":"What Twelve LLM Agent Benchmark Papers Disclose About Themselves: A Pilot Audit and an Open Scoring Schema","publication_year":2026,"publication_date":"2026-05-20","ids":{"openalex":"https://openalex.org/W7162030328","doi":"https://doi.org/10.48550/arxiv.2605.21404"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.21404","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21404","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.21404","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061940449","display_name":"Mahdi Naser Moghadasi","orcid":null},"institutions":[{"id":"https://openalex.org/I12315562","display_name":"Texas Tech University","ror":"https://ror.org/0405mnx93","country_code":"US","type":"education","lineage":["https://openalex.org/I12315562"]},{"id":"https://openalex.org/I71637028","display_name":"University of Brighton","ror":"https://ror.org/04kp2b655","country_code":"GB","type":"education","lineage":["https://openalex.org/I71637028"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Moghadasi, Mahdi Naser","raw_affiliation_strings":["BrightMind AI, Texas Tech University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BrightMind AI, Texas Tech University","institution_ids":["https://openalex.org/I12315562","https://openalex.org/I71637028"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074537862","display_name":"Faezeh Ghaderi","orcid":"https://orcid.org/0000-0003-1620-2662"},"institutions":[{"id":"https://openalex.org/I189196454","display_name":"The University of Texas at Arlington","ror":"https://ror.org/019kgqr73","country_code":"US","type":"education","lineage":["https://openalex.org/I189196454"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ghaderi, Faezeh","raw_affiliation_strings":["University of Texas at Arlington"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Texas at Arlington","institution_ids":["https://openalex.org/I189196454"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10081","display_name":"Auditing, Earnings Management, Governance","score":0.32109999656677246,"subfield":{"id":"https://openalex.org/subfields/1402","display_name":"Accounting"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10081","display_name":"Auditing, Earnings Management, Governance","score":0.32109999656677246,"subfield":{"id":"https://openalex.org/subfields/1402","display_name":"Accounting"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.06310000270605087,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12378","display_name":"Accounting Education and Careers","score":0.03999999910593033,"subfield":{"id":"https://openalex.org/subfields/1402","display_name":"Accounting"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.5508000254631042},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5236999988555908},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.49160000681877136},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.43810001015663147},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.4318000078201294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7177000045776367},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.5508000254631042},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5236999988555908},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.49160000681877136},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.43810001015663147},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.4318000078201294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3610000014305115},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3474999964237213},{"id":"https://openalex.org/C41282006","wikidata":"https://www.wikidata.org/wiki/Q830689","display_name":"Auditor's report","level":3,"score":0.290800005197525},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.2903999984264374}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.21404","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21404","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.21404","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21404","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.4399515986442566,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,84,131,208],"read":[1],"twelve":[2,118],"well-known":[3],"LLM":[4],"agent":[5,137,184],"benchmark":[6,39,92,185],"papers":[7,32,120,160,186],"and":[8,45,47,126,142,166,193,223,250],"recorded,":[9],"dimension":[10],"by":[11,235],"dimension,":[12],"what":[13,128,253],"each":[14],"paper":[15,76],"actually":[16],"says":[17],"about":[18],"how":[19],"its":[20,140],"evaluation":[21,206],"was":[22,233],"run.":[23],"The":[24,152,231],"motivation":[25],"came":[26],"from":[27],"a":[28,86,103,149,200,213,220,229,236,242],"familiar":[29],"frustration:":[30],"two":[31],"will":[33],"report":[34,80],"results":[35],"on":[36,81,178,194],"the":[37,41,53,55,58,61,67,82,107,133,157,168,174,182,205,210,217,224,246],"same":[38,42],"with":[40,106],"model":[43],"name":[44],"disagree,":[46],"you":[48,73],"cannot":[49],"tell":[50],"why":[51],"--":[52],"scaffold,":[54],"sampling":[56],"settings,":[57,97],"subset,":[59],"or":[60],"evaluator":[62],"version.":[63],"In":[64],"many":[65],"cases":[66,109],"published":[68],"artifact":[69],"does":[70],"not":[71,139],"let":[72],"answer.":[74],"This":[75],"is":[77,161,177,245],"an":[78,136],"implementation":[79],"attempt.":[83],"designed":[85],"small":[87],"audit":[88,154,244],"schema":[89,211],"(five":[90],"fields:":[91],"identity,":[93],"harness":[94,195],"specification,":[95],"inference":[96,188],"cost":[98,179,189],"reporting,":[99],"failure":[100],"breakdown),":[101],"wrote":[102],"scoring":[104,226,232],"codebook":[105,218],"boundary":[108],"we":[110,129,251,254],"hit":[111],"during":[112],"pilot":[113],"scoring,":[114],"applied":[115],"it":[116,256],"to":[117],"canonical":[119],"(eight":[121],"agent,":[122],"four":[123,169],"classical":[124,170],"static),":[125],"recorded":[127],"saw.":[130],"score":[132,155],"disclosure":[134,147],"of":[135,164,181,204],"run,":[138],"correctness,":[141],"make":[143],"no":[144],"claim":[145],"that":[146],"implies":[148],"trustworthy":[150],"result.":[151],"mean":[153],"across":[156,167],"eight":[158,183],"agent-benchmark":[159],"0.38":[162],"(out":[163],"1.0),":[165],"static":[171],"benchmarks":[172],"0.66;":[173],"largest":[175],"gap":[176],"(none":[180,197],"disclose":[187,199],"in":[190,239],"any":[191],"form)":[192],"specification":[196],"fully":[198],"content-addressed":[201],"container":[202],"image":[203],"environment).":[207],"release":[209],"as":[212,219,228],"JSON":[214],"Schema":[215],"file,":[216],"Markdown":[221],"document,":[222],"raw":[225],"sheet":[227],"CSV.":[230],"performed":[234],"single":[237],"auditor":[238],"one":[240],"pass;":[241],"multi-rater":[243],"natural":[247],"next":[248],"step,":[249],"discuss":[252],"think":[255],"would":[257],"change.":[258]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-22T00:00:00"}
