{"id":"https://openalex.org/W7151965843","doi":"https://doi.org/10.48550/arxiv.2604.06111","title":"AgentCE-Bench: Agent Configurable Evaluation with Scalable Horizons and Controllable Difficulty under Lightweight Environments","display_name":"AgentCE-Bench: Agent Configurable Evaluation with Scalable Horizons and Controllable Difficulty under Lightweight Environments","publication_year":2026,"publication_date":"2026-04-07","ids":{"openalex":"https://openalex.org/W7151965843","doi":"https://doi.org/10.48550/arxiv.2604.06111"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06111","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06111","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06111","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133159543","display_name":"Wang Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yang, Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111262990","display_name":"Chaoda Song","orcid":"https://orcid.org/0009-0001-5392-6304"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Chaoda","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133211176","display_name":"Xinpeng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xinpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133178883","display_name":"Debargha Ganguly","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ganguly, Debargha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133219566","display_name":"Chuang Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Chuang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122963420","display_name":"Shouren Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shouren","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123475562","display_name":"Zhihao Dou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dou, Zhihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101951953","display_name":"Yuli Zhou","orcid":"https://orcid.org/0000-0001-7206-9697"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yuli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133203382","display_name":"Vipin Chaudhary","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chaudhary, Vipin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133197226","display_name":"Xiaotian Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Xiaotian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5133159543"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.2775000035762787,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.2775000035762787,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1949000060558319,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.14640000462532043,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7576000094413757},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.6919000148773193},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6680999994277954},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6158999800682068},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5853999853134155},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.5601000189781189},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4577000141143799},{"id":"https://openalex.org/keywords/decoy","display_name":"Decoy","score":0.41339999437332153}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7576000094413757},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7081999778747559},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.6919000148773193},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6680999994277954},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6158999800682068},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6000999808311462},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5853999853134155},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.5601000189781189},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4577000141143799},{"id":"https://openalex.org/C2779179475","wikidata":"https://www.wikidata.org/wiki/Q3545649","display_name":"Decoy","level":3,"score":0.41339999437332153},{"id":"https://openalex.org/C2780416260","wikidata":"https://www.wikidata.org/wiki/Q2063","display_name":"JSON","level":2,"score":0.39559999108314514},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3935000002384186},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3564000129699707},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C159176650","wikidata":"https://www.wikidata.org/wiki/Q43261","display_name":"Horizon","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.33160001039505005},{"id":"https://openalex.org/C28761237","wikidata":"https://www.wikidata.org/wiki/Q7805321","display_name":"Time horizon","level":2,"score":0.30059999227523804},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C134448949","wikidata":"https://www.wikidata.org/wiki/Q1384274","display_name":"Expediting","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2556000053882599}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06111","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06111","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06111","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06111","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.6746672987937927,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"Agent":[1],"benchmarks":[2],"suffer":[3],"from":[4],"two":[5,71],"critical":[6],"limitations:":[7],"high":[8],"environment":[9],"interaction":[10],"overhead":[11,119],"(up":[12],"to":[13,57],"41\\%":[14],"of":[15,80,97,162,184],"total":[16],"evaluation":[17,124,183],"time)":[18],"and":[19,23,62,84,120,134,142,144,151,165,175,181],"imbalanced":[20],"task":[21,140],"horizon":[22,141],"difficulty":[24],"distributions":[25],"that":[26,93,132,145,177],"make":[27],"aggregate":[28],"scores":[29],"unreliable.":[30],"To":[31],"address":[32],"these":[33],"issues,":[34],"we":[35],"propose":[36],"AgentCE-Bench":[37,146,178],"built":[38],"around":[39],"a":[40,52,89,113],"unified":[41],"grid-based":[42],"planning":[43],"task,":[44],"where":[45],"agents":[46],"must":[47],"fill":[48],"hidden":[49,81],"slots":[50,82],"in":[51],"partially":[53],"completed":[54],"schedule":[55],"subject":[56],"both":[58],"local":[59],"slot":[60],"constraints":[61],"global":[63],"constraints.":[64],"Our":[65],"benchmark":[66],"offers":[67],"fine-grained":[68],"control":[69,138],"through":[70],"orthogonal":[72],"axes:":[73],"\\textbf{Scalable":[74],"Horizons},":[75],"controlled":[76],"by":[77,88],"the":[78,95],"number":[79,96],"$H$,":[83],"\\textbf{Controllable":[85],"Difficulty},":[86],"governed":[87],"decoy":[90,100],"budget":[91],"$B$":[92,135],"determines":[94],"globally":[98],"misleading":[99],"candidates.":[101],"Crucially,":[102],"all":[103],"tool":[104],"calls":[105],"are":[106],"resolved":[107],"via":[108],"static":[109],"JSON":[110],"files":[111],"under":[112],"\\textbf{Lightweight":[114],"Environment}":[115],"design,":[116],"eliminating":[117],"setup":[118],"enabling":[121],"fast,":[122],"reproducible":[123],"suitable":[125],"for":[126],"training-time":[127],"validation.":[128],"We":[129,154],"first":[130],"validate":[131],"$H$":[133],"provide":[136],"reliable":[137],"over":[139,167],"difficulty,":[143],"exhibits":[147],"strong":[148],"domain":[149],"consistency":[150],"model":[152],"discriminability.":[153],"then":[155],"conduct":[156],"comprehensive":[157],"experiments":[158],"across":[159],"13":[160],"models":[161],"diverse":[163],"sizes":[164],"families":[166],"6":[168],"domains,":[169],"revealing":[170],"significant":[171],"cross-model":[172],"performance":[173],"variation":[174],"confirming":[176],"provides":[179],"interpretable":[180],"controllable":[182],"agent":[185],"reasoning.":[186]},"counts_by_year":[],"updated_date":"2026-04-14T06:02:45.956762","created_date":"2026-04-09T00:00:00"}
