{"id":"https://openalex.org/W7134224958","doi":"https://doi.org/10.48550/arxiv.2603.05910","title":"The World Won't Stay Still: Programmable Evolution for Agent Benchmarks","display_name":"The World Won't Stay Still: Programmable Evolution for Agent Benchmarks","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7134224958","doi":"https://doi.org/10.48550/arxiv.2603.05910"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.05910","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128604618","display_name":"Guangrui Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Guangrui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128391182","display_name":"Yaochen Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yaochen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128592282","display_name":"Yi Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128567599","display_name":"Ziwei Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Ziwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128483430","display_name":"Xingyuan Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Xingyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053505580","display_name":"Tianqi Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Tianqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128408756","display_name":"Jason Choi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Jason","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023188897","display_name":"Michael J. Morais","orcid":"https://orcid.org/0000-0003-3739-2805"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Morais, Michael J.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013086444","display_name":"Binit Kumar Jha","orcid":"https://orcid.org/0000-0002-0844-2538"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jha, Binit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101929005","display_name":"Shaunak Mishra","orcid":"https://orcid.org/0000-0003-3373-6172"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mishra, Shaunak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001190649","display_name":"Bingrou Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Bingrou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128380183","display_name":"Chen Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110641878","display_name":"Monica Xiao Cheng","orcid":"https://orcid.org/0009-0003-8594-7104"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Monica Xiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128614819","display_name":"Dawn Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Dawn","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5128604618"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.12150000035762787,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.12150000035762787,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.11089999973773956,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.10559999942779541,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.642799973487854},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.5730999708175659},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.5095000267028809},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5026000142097473},{"id":"https://openalex.org/keywords/evolvability","display_name":"Evolvability","score":0.46459999680519104},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4438999891281128},{"id":"https://openalex.org/keywords/evolutionary-algorithm","display_name":"Evolutionary algorithm","score":0.3409999907016754}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.769599974155426},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.642799973487854},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.5730999708175659},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.5095000267028809},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5026000142097473},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.4706999957561493},{"id":"https://openalex.org/C176147130","wikidata":"https://www.wikidata.org/wiki/Q909622","display_name":"Evolvability","level":2,"score":0.46459999680519104},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4438999891281128},{"id":"https://openalex.org/C159149176","wikidata":"https://www.wikidata.org/wiki/Q14489129","display_name":"Evolutionary algorithm","level":2,"score":0.3409999907016754},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3368000090122223},{"id":"https://openalex.org/C105902424","wikidata":"https://www.wikidata.org/wiki/Q1197129","display_name":"Evolutionary computation","level":2,"score":0.3246000111103058},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30730000138282776},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.30160000920295715},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.29190000891685486},{"id":"https://openalex.org/C176225458","wikidata":"https://www.wikidata.org/wiki/Q595971","display_name":"Graph database","level":3,"score":0.2856000065803528},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.2639999985694885}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.05910","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.05910","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.05910","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.05910","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.5075370669364929,"display_name":"Life in Land","id":"https://metadata.un.org/sdg/15"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM-powered":[0],"tool-calling":[1,71,160],"agents":[2,41,189],"fulfill":[3],"user":[4],"requests":[5],"by":[6],"interacting":[7],"with":[8,30],"environments,":[9],"querying":[10],"data,":[11,100],"and":[12,33,102,124,143,153,163,172],"invoking":[13],"tools":[14],"in":[15,158,166],"a":[16,67,76,87,92,182],"multi-turn":[17],"process.":[18],"Yet,":[19],"most":[20],"existing":[21],"benchmarks":[22],"evaluate":[23],"these":[24],"systems":[25],"under":[26,191],"static":[27],"environment":[28,56,64,81,98,193],"interfaces,":[29],"fixed":[31],"schemas":[32],"toolsets,":[34],"making":[35],"it":[36],"difficult":[37],"to":[38,185],"assess":[39],"how":[40,187],"behave":[42,190],"as":[43,66,114,181],"environments":[44,138],"evolves":[45],"--":[46],"when":[47],"capabilities":[48,111],"are":[49,112],"added,":[50],"reorganized,":[51],"or":[52,109],"deprecated":[53],"across":[54,121],"successive":[55],"versions.":[57],"In":[58],"this":[59,105],"paper,":[60],"we":[61,176],"study":[62,186],"structured":[63,192],"evolution":[65,82],"benchmark-construction":[68],"problem":[69],"for":[70],"agents.":[72],"We":[73,155],"propose":[74],"ProEvolve,":[75],"graph-based":[77],"framework":[78],"that":[79,117],"makes":[80],"programmable.":[83],"At":[84],"its":[85],"core,":[86],"typed":[88],"relational":[89],"graph":[90,115,141],"provides":[91],"unified,":[93],"explicit":[94,140],"representation":[95],"of":[96,135,147,168],"the":[97,178],"-":[99],"tools,":[101,122],"schema.":[103],"Under":[104],"formalism,":[106],"adding,":[107],"removing,":[108],"modifying":[110],"expressed":[113],"transformations":[116],"coherently":[118],"propagate":[119],"updates":[120],"schemas,":[123],"data":[125],"access.":[126],"Building":[127],"on":[128],"this,":[129],"ProEvolve":[130,157],"supports":[131],"(1)":[132],"automatic":[133],"generation":[134],"evolved":[136],"executable":[137],"through":[139],"transformations,":[142],"(2)":[144],"graph-grounded":[145],"construction":[146],"task":[148],"sandboxes":[149],"via":[150],"subgraph":[151],"sampling":[152],"instantiation.":[154],"validate":[156],"two":[159],"domains,":[161],"e-commerce":[162],"airline":[164],"booking,":[165],"terms":[167],"quality,":[169],"implementation":[170],"validity,":[171],"failure":[173],"modes.":[174],"Finally,":[175],"use":[177],"generated":[179],"benchmark":[180],"downstream":[183],"diagnostic":[184],"representative":[188],"evolution.":[194]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-10T00:00:00"}
