{"id":"https://openalex.org/W7162694930","doi":"https://doi.org/10.48550/arxiv.2605.27898","title":"A Unified Framework for the Evaluation of LLM Agentic Capabilities","display_name":"A Unified Framework for the Evaluation of LLM Agentic Capabilities","publication_year":2026,"publication_date":"2026-05-27","ids":{"openalex":"https://openalex.org/W7162694930","doi":"https://doi.org/10.48550/arxiv.2605.27898"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27898","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137283587","display_name":"Pengyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Pengyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137291370","display_name":"Lijun Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006651724","display_name":"Yaxing Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Yaxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110711391","display_name":"Qianxin Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Qianxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137257834","display_name":"Jingyi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jingyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137304320","display_name":"Yi Liu","orcid":"https://orcid.org/0009-0001-2358-4526"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065626573","display_name":"Tingfeng Hui","orcid":"https://orcid.org/0009-0006-3524-4904"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hui, Tingfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137272313","display_name":"Xinyu Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Xinyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137270352","display_name":"Li Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137209740","display_name":"Sen Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Sen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137288416","display_name":"Jing Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.3061999976634979,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.3061999976634979,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.08309999853372574,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.05900000035762787,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.7961000204086304},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7815999984741211},{"id":"https://openalex.org/keywords/extensibility","display_name":"Extensibility","score":0.4056999981403351},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.4032999873161316},{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.4025999903678894},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.3749000132083893}],"concepts":[{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.7961000204086304},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7815999984741211},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7404000163078308},{"id":"https://openalex.org/C32833848","wikidata":"https://www.wikidata.org/wiki/Q4115054","display_name":"Extensibility","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4032999873161316},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.4025999903678894},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.3749000132083893},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.34369999170303345},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.32919999957084656},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3000999987125397},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C53619493","wikidata":"https://www.wikidata.org/wiki/Q4787093","display_name":"Architecture framework","level":3,"score":0.2718999981880188},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2662000060081482}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"LLMs":[1],"are":[2,223],"increasingly":[3],"deployed":[4],"as":[5,40,212],"agents,":[6],"reliable":[7],"assessment":[8],"of":[9,43,59],"their":[10],"agentic":[11,61],"capabilities":[12,201],"has":[13],"become":[14],"essential.":[15],"However,":[16],"reported":[17],"benchmark":[18,30,189],"scores":[19],"often":[20],"jointly":[21],"reflect":[22],"model":[23],"capability":[24],"and":[25,90,108,136,141,160,163,172,184,204,220],"the":[26,44,56,69,120],"implementation":[27],"choices":[28],"each":[29,124],"is":[31],"packaged":[32],"with,":[33],"making":[34],"cross-benchmark":[35],"results":[36,179],"difficult":[37],"to":[38,197],"interpret":[39],"clean":[41],"measurements":[42],"underlying":[45],"model.":[46],"In":[47],"this":[48,146],"work,":[49],"we":[50,118,148],"present":[51],"a":[52,65,75,82,87,137,165,213],"unified":[53,66,131],"framework":[54,70,106,196],"for":[55,133,139,216],"fair":[57],"evaluation":[58,121],"LLM":[60,200],"capabilities.":[62],"Driven":[63],"by":[64],"configuration":[67],"system,":[68],"integrates":[71],"diverse":[72],"benchmarks":[73,153,221],"into":[74],"standardized":[76],"instruction--tool--environment":[77],"format,":[78],"executes":[79],"agents":[80],"through":[81],"fixed":[83],"ReAct-style":[84],"architecture":[85],"within":[86],"controllable":[88],"sandbox,":[89],"provides":[91],"an":[92],"optional":[93],"offline":[94],"setting":[95],"that":[96,105,181],"replaces":[97],"volatile":[98],"live":[99],"environments":[100],"with":[101],"curated":[102],"snapshots,":[103],"so":[104],"effects":[107,110],"environment":[109],"can":[111],"be":[112],"analyzed":[113],"separately.":[114],"Building":[115],"on":[116,175],"this,":[117],"unify":[119],"methodology":[122],"under":[123],"benchmark's":[125],"original":[126],"task-success":[127],"criteria,":[128],"while":[129],"introducing":[130],"metrics":[132],"resource":[134],"consumption":[135],"taxonomy":[138],"decision-":[140],"execution-level":[142],"failure":[143],"attribution.":[144],"Within":[145],"framework,":[147],"adapt":[149],"7":[150],"widely":[151],"used":[152],"spanning":[154],"24":[155],"domains":[156],"across":[157],"single-agent,":[158],"multi-agent,":[159],"safety-critical":[161,217],"scenarios,":[162],"conduct":[164],"large-scale":[166],"empirical":[167],"analysis":[168],"over":[169],"400K":[170],"rollouts":[171],"5B":[173],"tokens":[174],"15":[176],"models.":[177],"The":[178],"show":[180],"scaffold":[182],"choice":[183],"environmental":[185],"volatility":[186],"materially":[187],"shift":[188],"outcomes":[190],"in":[191],"both":[192],"directions,":[193],"allowing":[194],"our":[195],"disentangle":[198],"intrinsic":[199],"from":[202],"framework-":[203],"environment-induced":[205],"artifacts.":[206],"We":[207],"further":[208],"demonstrate":[209],"its":[210],"extensibility":[211],"secure":[214],"testbed":[215],"domains.":[218],"Codes":[219],"at":[222,225],"available":[224],"https://github.com/whfeLingYu/A-Unified-Framework-for-the-Evaluation-of-LLM-Agentic-Capabilities,":[226],"https://huggingface.co/AgentFramework/Unified_Farmework.":[227]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-29T00:00:00"}
