{"id":"https://openalex.org/W7138308002","doi":"https://doi.org/10.48550/arxiv.2603.13594","title":"EnterpriseOps-Gym: Environments and Evaluations for Stateful Agentic Planning and Tool Use in Enterprise Settings","display_name":"EnterpriseOps-Gym: Environments and Evaluations for Stateful Agentic Planning and Tool Use in Enterprise Settings","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7138308002","doi":"https://doi.org/10.48550/arxiv.2603.13594"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13594","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13594","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13594","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129682158","display_name":"Shiva Krishna Reddy Malay","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Malay, Shiva Krishna Reddy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003441670","display_name":"Shravan Nayak","orcid":"https://orcid.org/0000-0002-5298-7121"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nayak, Shravan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129704597","display_name":"Jishnu Sethumadhavan Nair","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nair, Jishnu Sethumadhavan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117865086","display_name":"Sagar Davasam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Davasam, Sagar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129659805","display_name":"Aman Tiwari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tiwari, Aman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016214507","display_name":"Sathwik Tejaswi Madhusudhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Madhusudhan, Sathwik Tejaswi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080686450","display_name":"Sridhar Krishna Nemala","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nemala, Sridhar Krishna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129704080","display_name":"Srinivas Sunkara","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sunkara, Srinivas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5041629023","display_name":"Sai Rajeswar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajeswar, Sai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.21150000393390656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.21150000393390656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.12610000371932983,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.08950000256299973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.5849000215530396},{"id":"https://openalex.org/keywords/oracle","display_name":"Oracle","score":0.5501999855041504},{"id":"https://openalex.org/keywords/stateful-firewall","display_name":"Stateful firewall","score":0.5339999794960022},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.47699999809265137},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.4756999909877777},{"id":"https://openalex.org/keywords/sandbox","display_name":"Sandbox (software development)","score":0.38100001215934753},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.3628999888896942},{"id":"https://openalex.org/keywords/automated-planning-and-scheduling","display_name":"Automated planning and scheduling","score":0.35040000081062317}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6470999717712402},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.5849000215530396},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.5501999855041504},{"id":"https://openalex.org/C22927095","wikidata":"https://www.wikidata.org/wiki/Q1784206","display_name":"Stateful firewall","level":3,"score":0.5339999794960022},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.47699999809265137},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.4756999909877777},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.4408999979496002},{"id":"https://openalex.org/C167981075","wikidata":"https://www.wikidata.org/wiki/Q2667186","display_name":"Sandbox (software development)","level":2,"score":0.38100001215934753},{"id":"https://openalex.org/C195094911","wikidata":"https://www.wikidata.org/wiki/Q14167904","display_name":"Process management","level":1,"score":0.3741999864578247},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3628999888896942},{"id":"https://openalex.org/C114073186","wikidata":"https://www.wikidata.org/wiki/Q2631895","display_name":"Automated planning and scheduling","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C67571701","wikidata":"https://www.wikidata.org/wiki/Q1318054","display_name":"Enterprise system","level":2,"score":0.33390000462532043},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.33379998803138733},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.32260000705718994},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.31610000133514404},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3050999939441681},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.304500013589859},{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.301800012588501},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2962999939918518},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C2776889888","wikidata":"https://www.wikidata.org/wiki/Q1135789","display_name":"Unintended consequences","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2818000018596649},{"id":"https://openalex.org/C47487241","wikidata":"https://www.wikidata.org/wiki/Q5227230","display_name":"Data access","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C2777615720","wikidata":"https://www.wikidata.org/wiki/Q11888847","display_name":"Prioritization","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C185765463","wikidata":"https://www.wikidata.org/wiki/Q1318054","display_name":"Enterprise software","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C146342590","wikidata":"https://www.wikidata.org/wiki/Q11606385","display_name":"Enterprise systems engineering","level":4,"score":0.26440000534057617},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2572000026702881}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13594","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13594","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13594","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13594","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,113],"are":[3,92,176],"shifting":[4],"from":[5],"passive":[6],"information":[7],"providers":[8],"to":[9,31,61,83,154,163,191],"active":[10],"agents":[11,91,151,175],"intended":[12],"for":[13,41,180],"complex":[14],"workflows.":[15,200],"However,":[16],"their":[17],"deployment":[18],"as":[19,146],"reliable":[20],"AI":[21],"workers":[22],"in":[23,65,117,198],"enterprise":[24,67,182],"is":[25],"stalled":[26],"by":[27,139],"benchmarks":[28],"that":[29,132,173],"fail":[30,153],"capture":[32],"the":[33,39,120,147,193],"intricacies":[34],"of":[35,110,195],"professional":[36,199],"environments,":[37],"specifically,":[38],"need":[40],"long-horizon":[42],"planning":[43,64,197],"amidst":[44],"persistent":[45],"state":[46],"changes":[47],"and":[48,79,106,165],"strict":[49],"access":[50],"protocols.":[51],"In":[52],"this":[53,89],"work,":[54],"we":[55],"introduce":[56],"EnterpriseOps-Gym,":[57],"a":[58,72,188],"benchmark":[59],"designed":[60],"evaluate":[62],"agentic":[63,196],"realistic":[66],"settings.":[68],"Specifically,":[69],"EnterpriseOps-Gym":[70,186],"features":[71],"containerized":[73],"sandbox":[74],"with":[75],"164":[76],"database":[77],"tables":[78],"512":[80],"functional":[81],"tools":[82],"mimic":[84],"real-world":[85],"search":[86],"friction.":[87],"Within":[88],"environment,":[90],"evaluated":[93],"on":[94],"1,150":[95],"expert-curated":[96],"tasks":[97,157],"across":[98],"eight":[99],"mission-critical":[100],"verticals":[101],"(including":[102],"Customer":[103],"Service,":[104],"HR,":[105],"IT).":[107],"Our":[108,170],"evaluation":[109],"14":[111],"frontier":[112],"reveals":[114],"critical":[115],"limitations":[116],"state-of-the-art":[118],"models:":[119],"top-performing":[121],"Claude":[122],"Opus":[123],"4.5":[124],"achieves":[125,160],"only":[126],"37.4%":[127],"success.":[128],"Further":[129],"analysis":[130],"shows":[131],"providing":[133],"oracle":[134],"human":[135],"plans":[136],"improves":[137],"performance":[138],"14-35":[140],"percentage":[141],"points,":[142],"pinpointing":[143],"strategic":[144],"reasoning":[145],"primary":[148],"bottleneck.":[149],"Additionally,":[150],"frequently":[152],"refuse":[155],"infeasible":[156],"(best":[158],"model":[159],"53.9%),":[161],"leading":[162],"unintended":[164],"potentially":[166],"harmful":[167],"side":[168],"effects.":[169],"findings":[171],"underscore":[172],"current":[174],"not":[177],"yet":[178],"ready":[179],"autonomous":[181],"deployment.":[183],"More":[184],"broadly,":[185],"provides":[187],"concrete":[189],"testbed":[190],"advance":[192],"robustness":[194]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
