{"id":"https://openalex.org/W7159594753","doi":"https://doi.org/10.48550/arxiv.2604.28139","title":"Claw-Eval-Live: A Live Agent Benchmark for Evolving Real-World Workflows","display_name":"Claw-Eval-Live: A Live Agent Benchmark for Evolving Real-World Workflows","publication_year":2026,"publication_date":"2026-04-30","ids":{"openalex":"https://openalex.org/W7159594753","doi":"https://doi.org/10.48550/arxiv.2604.28139"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.28139","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.28139","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.28139","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134960937","display_name":"Chenxin Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chenxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134973090","display_name":"Zhengyang Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Zhengyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134953250","display_name":"Huangxin Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Mingxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134984805","display_name":"Yunlong Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yunlong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067715252","display_name":"Shijue Huang","orcid":"https://orcid.org/0000-0001-9443-7948"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Shijue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134949401","display_name":"Shengyuan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Shengyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134947673","display_name":"Bowen Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Bowen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134991521","display_name":"Rang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Rang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134956720","display_name":"Lei Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134935087","display_name":"Benyou Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Benyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134980752","display_name":"Yixuan Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Yixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10703","display_name":"Business Process Modeling and Analysis","score":0.4016000032424927,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10703","display_name":"Business Process Modeling and Analysis","score":0.4016000032424927,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.15000000596046448,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.0957999974489212,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.8090999722480774},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6456000208854675},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6247000098228455},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.3758000135421753},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.37310001254081726},{"id":"https://openalex.org/keywords/business-process","display_name":"Business process","score":0.3495999872684479},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.34549999237060547},{"id":"https://openalex.org/keywords/automation","display_name":"Automation","score":0.34360000491142273}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.8090999722480774},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7631000280380249},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6456000208854675},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6247000098228455},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3758000135421753},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.37470000982284546},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.37310001254081726},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3625999987125397},{"id":"https://openalex.org/C85345410","wikidata":"https://www.wikidata.org/wiki/Q851587","display_name":"Business process","level":3,"score":0.3495999872684479},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.34549999237060547},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.34360000491142273},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.3319000005722046},{"id":"https://openalex.org/C58581272","wikidata":"https://www.wikidata.org/wiki/Q12741163","display_name":"Workspace","level":3,"score":0.32179999351501465},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.31130000948905945},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C2779886121","wikidata":"https://www.wikidata.org/wiki/Q288682","display_name":"XACML","level":3,"score":0.28769999742507935},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C46934059","wikidata":"https://www.wikidata.org/wiki/Q61515","display_name":"Outsourcing","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C80958533","wikidata":"https://www.wikidata.org/wiki/Q1047174","display_name":"Audit trail","level":3,"score":0.25850000977516174},{"id":"https://openalex.org/C2775851571","wikidata":"https://www.wikidata.org/wiki/Q6045205","display_name":"Interaction protocol","level":3,"score":0.2563000023365021},{"id":"https://openalex.org/C94966114","wikidata":"https://www.wikidata.org/wiki/Q29256","display_name":"Black box","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.28139","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.28139","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.28139","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.28139","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM":[0,133],"agents":[1,41,61],"are":[2,187],"expected":[3],"to":[4,39],"complete":[5],"end-to-end":[6],"units":[7],"of":[8,179,237],"work":[9],"across":[10,69],"software":[11],"tools,":[12],"business":[13,146,200],"services,":[14,106],"and":[15,30,98,108,120,131,148,152,181,192,198,205,229,252],"local":[16,149,206],"workspaces.":[17],"Yet":[18],"many":[19],"agent":[20,255],"benchmarks":[21],"freeze":[22],"a":[23,49,56,64,76,158,234],"curated":[24],"task":[25,50,190],"set":[26],"at":[27],"release":[28,79,82,140],"time":[29],"grade":[31],"mainly":[32],"the":[33,95,173],"final":[34],"response,":[35],"making":[36],"it":[37],"difficult":[38],"evaluate":[40],"against":[42],"evolving":[43],"workflow":[44,60,167],"demand":[45,251],"or":[46],"verify":[47],"whether":[48],"was":[51],"executed.":[52],"We":[53],"introduce":[54],"Claw-Eval-Live,":[55],"live":[57],"benchmark":[58],"for":[59,136],"that":[62,165,241],"separates":[63],"refreshable":[65],"signal":[66],"layer,":[67],"updated":[68],"releases":[70],"from":[71,75,85,171],"public":[72,86,160],"workflow-demand":[73,87],"signals,":[74,88],"reproducible,":[77],"time-stamped":[78],"snapshot.":[80],"Each":[81],"is":[83,129,216],"constructed":[84],"with":[89,103,195,220],"ClawHub":[90],"Top-500":[91],"skills":[92],"used":[93],"in":[94,226,233,248,253],"current":[96],"release,":[97],"materialized":[99],"as":[100,202],"controlled":[101,145],"tasks":[102,143,180],"fixed":[104],"fixtures,":[105],"workspaces,":[107],"graders.":[109],"For":[110],"grading,":[111],"Claw-Eval-Live":[112,239],"records":[113],"execution":[114,193],"traces,":[115],"audit":[116],"logs,":[117],"service":[118],"state,":[119],"post-run":[121],"workspace":[122,150,207],"artifacts,":[123],"using":[124],"deterministic":[125],"checks":[126],"when":[127],"evidence":[128],"sufficient":[130],"structured":[132,188],"judging":[134],"only":[135,177],"semantic":[137],"dimensions.":[138],"The":[139],"contains":[141],"105":[142],"spanning":[144],"services":[147],"repair,":[151],"evaluates":[153],"13":[154],"frontier":[155],"models":[156,219],"under":[157],"shared":[159],"pass":[161,222],"rule.":[162],"Experiments":[163],"reveal":[164],"reliable":[166],"automation":[168],"remains":[169],"far":[170],"solved:":[172],"leading":[174],"model":[175,183],"passes":[176],"66.7%":[178],"no":[182],"reaches":[184],"70%.":[185],"Failures":[186],"by":[189],"family":[191],"surface,":[194],"HR,":[196],"management,":[197],"multi-system":[199],"workflows":[201],"persistent":[203],"bottlenecks":[204],"repair":[208],"comparatively":[209],"easier":[210],"but":[211],"unsaturated.":[212],"Leaderboard":[213],"rank":[214],"alone":[215],"insufficient":[217],"because":[218],"similar":[221],"rates":[223],"can":[224],"diverge":[225],"overall":[227],"completion,":[228],"task-level":[230],"discrimination":[231],"concentrates":[232],"middle":[235],"band":[236],"tasks.":[238],"suggests":[240],"workflow-agent":[242],"evaluation":[243],"should":[244],"be":[245],"grounded":[246],"twice,":[247],"fresh":[249],"external":[250],"verifiable":[254],"action.":[256]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-02T00:00:00"}
