{"id":"https://openalex.org/W7162634564","doi":"https://doi.org/10.48550/arxiv.2605.27492","title":"Benchmarks are Not Enough: RAMP for Runtime Assessing of Agentic Models in Production Systems","display_name":"Benchmarks are Not Enough: RAMP for Runtime Assessing of Agentic Models in Production Systems","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162634564","doi":"https://doi.org/10.48550/arxiv.2605.27492"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27492","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27492","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27492","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044428220","display_name":"Yipeng Ouyang","orcid":"https://orcid.org/0009-0008-3508-6592"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Yipeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137192385","display_name":"Xin Huang","orcid":"https://orcid.org/0000-0002-0998-7935"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137192084","display_name":"Bingjie Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Bingjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137267303","display_name":"Zhongchun Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zhongchun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135438336","display_name":"Yuhao Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Yuhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137306080","display_name":"Xianwei Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xianwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.3903999924659729,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.3903999924659729,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.2644999921321869,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.04610000178217888,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/toolchain","display_name":"Toolchain","score":0.6353999972343445},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6298999786376953},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.44339999556541443},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.44339999556541443},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.41659998893737793},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.40119999647140503},{"id":"https://openalex.org/keywords/symbolic-execution","display_name":"Symbolic execution","score":0.4000999927520752},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.3952000141143799},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.39169999957084656},{"id":"https://openalex.org/keywords/iterative-and-incremental-development","display_name":"Iterative and incremental development","score":0.3847000002861023}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7998999953269958},{"id":"https://openalex.org/C2777062904","wikidata":"https://www.wikidata.org/wiki/Q545406","display_name":"Toolchain","level":3,"score":0.6353999972343445},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6298999786376953},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.45570001006126404},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.44339999556541443},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.44339999556541443},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.41659998893737793},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.40119999647140503},{"id":"https://openalex.org/C2779639559","wikidata":"https://www.wikidata.org/wiki/Q7661178","display_name":"Symbolic execution","level":3,"score":0.4000999927520752},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.396699994802475},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3952000141143799},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C143587482","wikidata":"https://www.wikidata.org/wiki/Q1543216","display_name":"Iterative and incremental development","level":2,"score":0.3847000002861023},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.357699990272522},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C202973057","wikidata":"https://www.wikidata.org/wiki/Q7380130","display_name":"Runtime verification","level":3,"score":0.32760000228881836},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.313400000333786},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3118000030517578},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.3091000020503998},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.30869999527931213},{"id":"https://openalex.org/C162262903","wikidata":"https://www.wikidata.org/wiki/Q343527","display_name":"Allocator","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.30000001192092896},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.2996000051498413},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.2994999885559082},{"id":"https://openalex.org/C198140048","wikidata":"https://www.wikidata.org/wiki/Q10859422","display_name":"Software versioning","level":3,"score":0.2964000105857849},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C2777256151","wikidata":"https://www.wikidata.org/wiki/Q7897273","display_name":"Unpacking","level":2,"score":0.28929999470710754},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.2831999957561493},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.2703000009059906},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2574999928474426},{"id":"https://openalex.org/C126433048","wikidata":"https://www.wikidata.org/wiki/Q7921324","display_name":"Verification and validation of computer simulation models","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C97686452","wikidata":"https://www.wikidata.org/wiki/Q7604153","display_name":"Static analysis","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C119701452","wikidata":"https://www.wikidata.org/wiki/Q5165881","display_name":"Control reconfiguration","level":2,"score":0.2524000108242035},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.25220000743865967},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2515000104904175},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27492","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27492","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27492","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27492","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.6244872808456421}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM":[0],"agents":[1],"are":[2],"rapidly":[3],"evolving":[4],"from":[5,166],"coding":[6],"assistants":[7],"into":[8],"autonomous":[9],"software":[10,73],"engineering":[11,74],"systems.":[12],"However,":[13],"existing":[14],"evaluation":[15,221],"methodologies":[16],"remain":[17],"largely":[18,151],"centered":[19],"on":[20],"static,":[21],"isolated,":[22],"and":[23,59,92,103,133,144,196,225],"short-horizon":[24],"benchmarks":[25],"that":[26,128,149],"fail":[27],"to":[28,153,172,206],"capture":[29],"the":[30,78,169,176,182,187],"dynamic":[31],"complexity":[32],"of":[33,181,209],"real-world":[34],"production":[35],"workflows.":[36],"As":[37],"a":[38,67,84,109],"result,":[39],"benchmark":[40],"performance":[41],"may":[42],"poorly":[43],"reflect":[44],"practical":[45],"capability":[46,147],"under":[47,117],"realistic":[48,97],"runtime":[49,86,138],"environments":[50],"involving":[51],"long":[52],"execution":[53,93,115],"chains,":[54],"tool":[55],"interactions,":[56,106],"dependency":[57],"management,":[58],"iterative":[60],"feedback":[61],"loops.":[62],"We":[63,136],"thus":[64],"present":[65],"RAMP,":[66],"production-grounded":[68,226],"infrastructure":[69],"for":[70,113],"assessing":[71],"long-horizon":[72],"agents.":[75],"Built":[76],"upon":[77],"YatCC":[79],"integrated":[80],"platform,":[81],"RAMP":[82,95,217],"provides":[83],"unified":[85],"assessment":[87],"architecture":[88],"through":[89],"standardized":[90],"orchestration":[91],"interfaces.":[94],"introduces":[96],"compiler-construction":[98],"workloads":[99],"with":[100,108,200],"serial":[101,163],"dependencies":[102],"complex":[104],"toolchain":[105],"together":[107],"staged":[110],"recovery":[111],"mechanism":[112],"analyzing":[114],"behavior":[116],"partial":[118],"workflow":[119],"failure.":[120],"The":[121],"framework":[122],"further":[123],"incorporates":[124],"utility-oriented":[125],"multi-dimensional":[126],"metrics":[127],"jointly":[129],"evaluate":[130],"outcome":[131],"quality":[132],"process":[134],"efficiency.":[135],"conduct":[137],"assessments":[139],"across":[140,162],"15":[141],"mainstream":[142],"models":[143,184],"observe":[145],"substantial":[146],"degradation":[148],"remains":[150],"invisible":[152],"conventional":[154],"isolated":[155],"benchmarks.":[156],"Task":[157],"completion":[158],"rates":[159],"progressively":[160],"collapse":[161],"workflows,":[164],"dropping":[165],"100%":[167],"in":[168,175],"initial":[170],"stage":[171],"only":[173],"20%":[174],"final":[177],"stage,":[178],"while":[179],"none":[180],"evaluated":[183],"successfully":[185],"completes":[186],"entire":[188],"pipeline.":[189],"Runtime":[190],"analysis":[191],"reveals":[192],"systematic":[193],"failure":[194],"propagation":[195],"significant":[197],"resource":[198],"inefficiencies,":[199],"computational":[201],"costs":[202],"differing":[203],"by":[204],"up":[205],"three":[207],"orders":[208],"magnitude":[210],"among":[211],"comparable":[212],"models.":[213],"These":[214],"findings":[215],"suggest":[216],"advances":[218],"agentic":[219],"model":[220],"toward":[222],"continuous,":[223],"runtime-observable,":[224],"assessment.":[227]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-29T00:00:00"}
