{"id":"https://openalex.org/W7126051014","doi":"https://doi.org/10.48550/arxiv.2601.20617","title":"Agent Benchmarks Fail Public Sector Requirements","display_name":"Agent Benchmarks Fail Public Sector Requirements","publication_year":2026,"publication_date":"2026-01-28","ids":{"openalex":"https://openalex.org/W7126051014","doi":"https://doi.org/10.48550/arxiv.2601.20617"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.20617","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124285623","display_name":"Jonathan Rystr\u00f8m","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rystr\u00f8m, Jonathan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120003962","display_name":"Chris Schmitz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schmitz, Chris","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124232716","display_name":"Karolina Korgul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Korgul, Karolina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124213681","display_name":"Jan Batzner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Batzner, Jan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124179636","display_name":"Chris Russell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Russell, Chris","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5124285623"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.16410000622272491,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.16410000622272491,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.14100000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10953","display_name":"E-Government and Public Services","score":0.08959999680519104,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.760200023651123},{"id":"https://openalex.org/keywords/public-sector","display_name":"Public sector","score":0.6773999929428101},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4733999967575073},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.43369999527931213},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.31060001254081726}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.760200023651123},{"id":"https://openalex.org/C147859227","wikidata":"https://www.wikidata.org/wiki/Q294217","display_name":"Public sector","level":2,"score":0.6773999929428101},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.508400022983551},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4758000075817108},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4733999967575073},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.43369999527931213},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.4332999885082245},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.2863999903202057},{"id":"https://openalex.org/C195094911","wikidata":"https://www.wikidata.org/wiki/Q14167904","display_name":"Process management","level":1,"score":0.25940001010894775},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2540000081062317}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.20617","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.20617","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.20617","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.20617","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.6449133157730103,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Deploying":[0],"Large":[1],"Language":[2],"Model-based":[3],"agents":[4],"(LLM":[5],"agents)":[6],"in":[7],"the":[8,16,87,91,119],"public":[9,73,92,133],"sector":[10],"requires":[11],"assuring":[12],"that":[13,85,112],"they":[14,47],"meet":[15,44],"stringent":[17],"legal,":[18],"procedural,":[19],"and":[20,27,82,136],"structural":[21],"requirements":[22,89],"of":[23,72,90,118],"public-sector":[24,50,138],"institutions.":[25],"Practitioners":[26],"researchers":[28,130],"often":[29],"turn":[30],"to":[31,45,126,131,140],"benchmarks":[32,42,56,76,135],"for":[33,101,128,137],"such":[34,65],"assessments.":[35],"However,":[36],"it":[37],"remains":[38],"unclear":[39],"what":[40],"criteria":[41,66,103,143],"must":[43,77],"ensure":[46],"adequately":[48],"reflect":[49,86],"requirements,":[51],"or":[52],"how":[53],"many":[54],"existing":[55],"do":[57],"so.":[58],"In":[59],"this":[60],"paper,":[61],"we":[62],"first":[63],"define":[64],"based":[67],"on":[68],"a":[69,124],"first-principles":[70],"survey":[71],"administration":[74],"literature:":[75],"be":[78],"\\emph{process-based},":[79],"\\emph{realistic},":[80],"\\emph{public-sector-specific}":[81],"report":[83],"\\emph{metrics}":[84],"unique":[88],"sector.":[93],"We":[94],"analyse":[95],"more":[96],"than":[97],"1,300":[98],"benchmark":[99,115],"papers":[100],"these":[102,142],"using":[104],"an":[105],"expert-validated":[106],"LLM-assisted":[107],"pipeline.":[108],"Our":[109,121],"results":[110],"show":[111],"no":[113],"single":[114],"meets":[116],"all":[117],"criteria.":[120],"findings":[122],"provide":[123],"call":[125],"action":[127],"both":[129],"develop":[132],"sector-relevant":[134],"officials":[139],"apply":[141],"when":[144],"evaluating":[145],"their":[146],"own":[147],"agentic":[148],"use":[149],"cases.":[150]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-01-30T00:00:00"}
