{"id":"https://openalex.org/W7124742307","doi":"https://doi.org/10.48550/arxiv.2601.11044","title":"AgencyBench: Benchmarking the Frontiers of Autonomous Agents in 1M-Token Real-World Contexts","display_name":"AgencyBench: Benchmarking the Frontiers of Autonomous Agents in 1M-Token Real-World Contexts","publication_year":2026,"publication_date":"2026-01-16","ids":{"openalex":"https://openalex.org/W7124742307","doi":"https://doi.org/10.48550/arxiv.2601.11044"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.11044","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.11044","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.11044","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101829323","display_name":"Keyu Li","orcid":"https://orcid.org/0000-0001-8893-1261"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Keyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123313103","display_name":"Junhao Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Junhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123296959","display_name":"Yang Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123329126","display_name":"Mohan Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Mohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122918842","display_name":"Jie Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Jie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123356914","display_name":"Yunze Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yunze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123293141","display_name":"Shijie Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Dayuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xia, Shijie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Shijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cai, Xiaojie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Xiaojie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123305940","display_name":"Weiye Si","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Tianze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123311657","display_name":"Wenjie Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Si, Weiye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074839390","display_name":"D. Y. Wang","orcid":"https://orcid.org/0000-0001-8270-8448"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wenjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Dequan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Dequan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Liu, Pengfei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Pengfei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5101829323"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.16820000112056732,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.16820000112056732,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.11159999668598175,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.09449999779462814,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.8015000224113464},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.7232999801635742},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6866000294685364},{"id":"https://openalex.org/keywords/sandbox","display_name":"Sandbox (software development)","score":0.6029000282287598},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5654000043869019},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.4803999960422516},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.40389999747276306},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.39579999446868896}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.8015000224113464},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7422000169754028},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.7232999801635742},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6866000294685364},{"id":"https://openalex.org/C167981075","wikidata":"https://www.wikidata.org/wiki/Q2667186","display_name":"Sandbox (software development)","level":2,"score":0.6029000282287598},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5654000043869019},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4803999960422516},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4235000014305115},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41370001435279846},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.40389999747276306},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.39579999446868896},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.36899998784065247},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3653999865055084},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.35510000586509705},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3133000135421753},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.3021000027656555},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.26669999957084656},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2597000002861023},{"id":"https://openalex.org/C149810388","wikidata":"https://www.wikidata.org/wiki/Q5374873","display_name":"Emulation","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C114073186","wikidata":"https://www.wikidata.org/wiki/Q2631895","display_name":"Automated planning and scheduling","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.11044","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.11044","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.11044","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.11044","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5987472534179688,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"based":[4],"autonomous":[5,222],"agents":[6],"demonstrate":[7,167],"multifaceted":[8],"capabilities":[9,69],"to":[10,13,26,100,112,120],"contribute":[11],"substantially":[12],"economic":[14],"production.":[15],"However,":[16],"existing":[17],"benchmarks":[18],"remain":[19],"focused":[20],"on":[21,34,217],"single":[22],"agentic":[23,68,161,209],"capability,":[24],"failing":[25],"capture":[27],"long-horizon":[28],"real-world":[29,72],"scenarios.":[30],"Moreover,":[31],"the":[32,158,202,218,227],"reliance":[33],"human-in-the-loop":[35],"feedback":[36],"for":[37,188,198],"realistic":[38],"tasks":[39,76],"creates":[40],"a":[41,57,108,117,195],"scalability":[42],"bottleneck,":[43],"hindering":[44],"automated":[45,104],"rollout":[46],"collection":[47],"and":[48,81,95,116,123,151,224,230],"evaluation.":[49],"To":[50,102],"bridge":[51],"this":[52,213],"gap,":[53],"we":[54,106,156,225],"introduce":[55],"AgencyBench,":[56],"comprehensive":[58],"benchmark":[59,229],"derived":[60],"from":[61],"daily":[62],"AI":[63],"usage,":[64],"evaluating":[65],"6":[66],"core":[67],"across":[70,144],"32":[71],"scenarios,":[73],"comprising":[74],"138":[75],"with":[77,208],"specific":[78,152,189],"queries,":[79],"deliverables,":[80],"rubrics.":[82],"These":[83],"scenarios":[84],"require":[85],"an":[86],"average":[87],"of":[88,97,160,204,221],"90":[89],"tool":[90],"calls,":[91],"1":[92],"million":[93],"tokens,":[94],"hours":[96],"execution":[98,190],"time":[99],"resolve.":[101],"enable":[103],"evaluation,":[105],"employ":[107],"user":[109],"simulation":[110],"agent":[111],"provide":[113],"iterative":[114],"feedback,":[115],"Docker":[118],"sandbox":[119],"conduct":[121],"visual":[122],"functional":[124],"rubric-based":[125],"assessment.":[126],"Experiments":[127],"reveal":[128],"that":[129,164],"closed-source":[130],"models":[131,135,145,166,180],"significantly":[132],"outperform":[133],"open-source":[134,179],"(48.4%":[136],"vs":[137],"32.1%).":[138],"Further":[139],"analysis":[140],"reveals":[141],"significant":[142],"disparities":[143],"in":[146],"resource":[147],"efficiency,":[148],"feedback-driven":[149],"self-correction,":[150],"tool-use":[153],"preferences.":[154],"Finally,":[155],"investigate":[157],"impact":[159],"scaffolds,":[162],"observing":[163],"proprietary":[165],"superior":[168],"performance":[169,183],"within":[170],"their":[171],"native":[172],"ecosystems":[173],"(e.g.,":[174],"Claude-4.5-Opus":[175],"via":[176],"Claude-Agent-SDK),":[177],"while":[178],"exhibit":[181],"distinct":[182],"peaks,":[184],"suggesting":[185],"potential":[186],"optimization":[187],"frameworks.":[191,210],"AgencyBench":[192],"serves":[193],"as":[194],"critical":[196],"testbed":[197],"next-generation":[199],"agents,":[200,223],"highlighting":[201],"necessity":[203],"co-optimizing":[205],"model":[206],"architecture":[207],"We":[211],"believe":[212],"work":[214],"sheds":[215],"light":[216],"future":[219],"direction":[220],"release":[226],"full":[228],"evaluation":[231],"toolkit":[232],"at":[233],"https://github.com/GAIR-NLP/AgencyBench.":[234]},"counts_by_year":[],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2026-01-20T00:00:00"}
