{"id":"https://openalex.org/W7161004569","doi":"https://doi.org/10.48550/arxiv.2605.11378","title":"An Empirical Study of Automating Agent Evaluation","display_name":"An Empirical Study of Automating Agent Evaluation","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161004569","doi":"https://doi.org/10.48550/arxiv.2605.11378"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11378","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11378","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11378","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136028066","display_name":"Kang Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhou, Kang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058640072","display_name":"Sangmin Woo","orcid":"https://orcid.org/0000-0003-4451-9675"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Woo, Sangmin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124000509","display_name":"Haibo Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Haibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124049127","display_name":"Kiran Ramnath","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ramnath, Kiran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056578172","display_name":"Subramanian Chidambaram","orcid":"https://orcid.org/0000-0001-8627-9898"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chidambaram, Subramanian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073331205","display_name":"Aosong Feng","orcid":"https://orcid.org/0000-0002-5474-3796"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Aosong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136046910","display_name":"Vinayak Arannil","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arannil, Vinayak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136005671","display_name":"Muhyun Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Muhyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136045937","display_name":"Ishan Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Ishan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136056833","display_name":"Darren Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Darren","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136086508","display_name":"Zhichao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zhichao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136050277","display_name":"Megha Gandhi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gandhi, Megha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074257963","display_name":"Nirmal Prabhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prabhu, Nirmal","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136059450","display_name":"Soumya Smruti Mishra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mishra, Soumya Smruti","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136017596","display_name":"Vivek Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Vivek","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109819293","display_name":"Gouri Pandeshwar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pandeshwar, Gouri","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136079534","display_name":"Lin Lee Cheong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheong, Lin Lee","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":17,"corresponding_author_ids":["https://openalex.org/A5136028066"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.15479999780654907,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.15479999780654907,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.12470000237226486,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.06340000033378601,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.7282000184059143},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.5631999969482422},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.46880000829696655},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4399999976158142},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.4147999882698059},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.41200000047683716},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.39730000495910645},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.3416999876499176}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7602999806404114},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.7282000184059143},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.5631999969482422},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.46880000829696655},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4399999976158142},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.4147999882698059},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.41370001435279846},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.41200000047683716},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.39730000495910645},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3898000121116638},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3734000027179718},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.3416999876499176},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.3375000059604645},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3330000042915344},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3003999888896942},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C3018395757","wikidata":"https://www.wikidata.org/wiki/Q1379672","display_name":"Evaluation methods","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2565999925136566},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11378","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11378","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11378","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11378","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.46357303857803345}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Agent":[0],"evaluation":[1,29,46,91,95,99,120,148,163,203],"requires":[2],"assessing":[3],"complex":[4,209],"multi-step":[5],"behaviors":[6],"involving":[7],"tool":[8],"use":[9],"and":[10,16,58,105,107,126,150,167,189],"intermediate":[11],"reasoning,":[12],"making":[13],"it":[14],"costly":[15],"expertise-intensive.":[17],"A":[18],"natural":[19],"question":[20],"arises:":[21],"can":[22],"frontier":[23,48],"coding":[24,37,49,70],"assistants":[25,38,50],"reliably":[26],"automate":[27],"this":[28,42],"process?":[30],"Our":[31,175],"study":[32],"shows":[33],"that":[34,68,86,112,178,202],"simply":[35],"prompting":[36],"is":[39],"insufficient":[40],"for":[41,207],"task.":[43],"Without":[44],"domain-specific":[45],"knowledge,":[47],"achieve":[51],"only":[52],"a":[53,115,135,140],"30%":[54],"execution":[55],"success":[56],"rate":[57],"produce":[59],"over-engineered":[60],"evaluations":[61],"averaging":[62],"12+":[63],"metrics":[64],"per":[65],"agent,":[66],"indicating":[67],"strong":[69],"ability":[71],"does":[72],"not":[73],"automatically":[74],"translate":[75],"to":[76,159,187,215,220],"reliable":[77],"agent":[78,90],"evaluation.":[79],"We":[80,153],"introduce":[81,134],"EvalAgent,":[82],"an":[83],"AI":[84],"assistant":[85],"automates":[87],"the":[88,156,172],"end-to-end":[89],"pipeline.":[92],"EvalAgent":[93,179],"encodes":[94],"domain":[96],"expertise":[97],"as":[98],"skills":[100,204],"(procedural":[101],"instructions,":[102],"reusable":[103],"code":[104,164],"templates,":[106],"dynamically":[108],"retrieved":[109],"API":[110],"documentation)":[111],"compose":[113],"into":[114],"trace-based":[116],"pipeline":[117],"producing":[118],"complete":[119],"artifacts":[121],"including":[122],"metrics,":[123],"executable":[124],"code,":[125],"reports.":[127],"To":[128],"systematically":[129],"assess":[130],"generated":[131,162],"evaluations,":[132,182],"we":[133],"meta-evaluation":[136],"framework":[137],"alongside":[138],"AgentEvalBench,":[139],"benchmark":[141],"comprising":[142],"20":[143],"agents,":[144],"each":[145],"paired":[146],"with":[147],"requirements":[149],"test":[151],"scenarios.":[152],"further":[154],"propose":[155],"Eval@1":[157,184,214],"metric":[158],"measure":[160],"whether":[161],"both":[165],"executes":[166],"yields":[168],"meaningful":[169],"results":[170],"on":[171],"first":[173],"run.":[174],"experiments":[176],"show":[177,201],"produces":[180],"focused":[181],"improving":[183],"from":[185,218],"17.5%":[186],"65%,":[188],"achieving":[190],"79.5%":[191],"human":[192],"expert":[193],"preference":[194],"over":[195],"baseline":[196],"approaches.":[197],"Further":[198],"ablation":[199],"studies":[200],"are":[205],"critical":[206],"handling":[208],"evaluation:":[210],"removing":[211],"them":[212],"causes":[213],"drop":[216],"significantly":[217],"65%":[219],"30%.":[221]},"counts_by_year":[],"updated_date":"2026-05-14T06:16:12.342656","created_date":"2026-05-14T00:00:00"}
