{"id":"https://openalex.org/W7151843496","doi":"https://doi.org/10.48550/arxiv.2604.06132","title":"Claw-Eval: Towards Trustworthy Evaluation of Autonomous Agents","display_name":"Claw-Eval: Towards Trustworthy Evaluation of Autonomous Agents","publication_year":2026,"publication_date":"2026-04-07","ids":{"openalex":"https://openalex.org/W7151843496","doi":"https://doi.org/10.48550/arxiv.2604.06132"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06132","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06132","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06132","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068180403","display_name":"B. Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ye, Bowen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133229738","display_name":"Rang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Rang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133234253","display_name":"Qibin Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Qibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133217244","display_name":"Yuanxin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuanxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133224355","display_name":"Linli Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Linli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049372471","display_name":"H. Lv","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lv, Hanglong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123886423","display_name":"Zhihui Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Zhihui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055648957","display_name":"Chenxin An","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"An, Chenxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133176513","display_name":"Lei Li","orcid":"https://orcid.org/0009-0007-5458-766X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133166540","display_name":"Lingpeng Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Lingpeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034064175","display_name":"Qi Liu","orcid":"https://orcid.org/0000-0002-9614-6228"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Qi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133216197","display_name":"Zhifang Sui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sui, Zhifang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133206894","display_name":"Tong Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Tong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5068180403"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.29019999504089355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.29019999504089355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.11400000005960464,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.05939999967813492,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.590499997138977},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5843999981880188},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.4422999918460846},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.4309000074863434},{"id":"https://openalex.org/keywords/trustworthiness","display_name":"Trustworthiness","score":0.40610000491142273},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.4059999883174896},{"id":"https://openalex.org/keywords/grading","display_name":"Grading (engineering)","score":0.3720000088214874},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.3684000074863434}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7569000124931335},{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.590499997138977},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5843999981880188},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.4422999918460846},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.4309000074863434},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.4059999883174896},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3822000026702881},{"id":"https://openalex.org/C2777286243","wikidata":"https://www.wikidata.org/wiki/Q5591926","display_name":"Grading (engineering)","level":2,"score":0.3720000088214874},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.3684000074863434},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3610000014305115},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.31790000200271606},{"id":"https://openalex.org/C5894958","wikidata":"https://www.wikidata.org/wiki/Q2297769","display_name":"Software agent","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.30079999566078186},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2922999858856201},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.27549999952316284},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2709999978542328},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.260699987411499},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C15569618","wikidata":"https://www.wikidata.org/wiki/Q3561421","display_name":"Liveness","level":2,"score":0.2538999915122986}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06132","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06132","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06132","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06132","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"No poverty","score":0.5018078088760376,"id":"https://metadata.un.org/sdg/1"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,122],"are":[3,20,197],"increasingly":[4],"deployed":[5],"as":[6],"autonomous":[7],"agents":[8,195],"for":[9,193],"multi-step":[10],"workflows":[11],"in":[12],"real-world":[13],"software":[14],"environments.":[15],"However,":[16],"existing":[17],"agent":[18],"benchmarks":[19],"limited":[21],"by":[22,142,161],"trajectory-opaque":[23],"grading,":[24,72],"underspecified":[25],"safety":[26,134],"and":[27,30,35,63,65,86,100,106,136,180],"robustness":[28,139],"evaluation,":[29],"narrow":[31],"coverage":[32,187],"of":[33,133,138],"modalities":[34],"interaction":[36],"paradigms.":[37],"We":[38],"introduce":[39],"Claw-Eval,":[40],"an":[41],"end-to-end":[42],"evaluation":[43,127,186],"suite":[44],"addressing":[45],"these":[46],"gaps":[47],"with":[48,102,151,173],"300":[49],"human-verified":[50],"tasks":[51],"spanning":[52],"9":[53],"categories":[54],"across":[55,108,177],"three":[56,78,109],"groups:":[57],"general":[58],"service":[59],"orchestration,":[60],"multimodal":[61],"perception":[62],"interaction,":[64],"multi-turn":[66],"professional":[67],"dialogue.":[68],"To":[69],"enable":[70],"trajectory-aware":[71],"each":[73],"run":[74],"is":[75,128,170,188],"recorded":[76],"through":[77],"independent":[79],"evidence":[80],"channels:":[81],"execution":[82],"traces,":[83],"audit":[84],"logs,":[85],"environment":[87],"snapshots,":[88],"yielding":[89],"2,159":[90],"fine-grained":[91],"rubric":[92],"items.":[93],"The":[94],"scoring":[95],"protocol":[96],"evaluates":[97],"Completion,":[98],"Safety,":[99],"Robustness,":[101],"Average":[103],"Score,":[104],"Pass@k,":[105],"Pass^k":[107],"trials":[110],"to":[111,163],"distinguish":[112],"genuine":[113],"capability":[114,169],"from":[115],"lucky":[116],"outcomes.":[117],"Experiments":[118],"on":[119],"14":[120],"frontier":[121],"show":[123],"that:":[124],"(1)":[125],"Trajectory-opaque":[126],"systematically":[129],"unreliable,":[130],"missing":[131],"44%":[132],"violations":[135],"13%":[137],"failures":[140],"detected":[141],"our":[143,184],"framework.":[144],"(2)":[145],"Capability":[146],"does":[147],"not":[148,198],"imply":[149],"consistency,":[150],"Pass@3":[152],"remaining":[153],"stable":[154],"under":[155],"error":[156],"injection":[157],"while":[158],"Pass^3":[159],"dropping":[160],"up":[162],"24":[164],"percentage":[165],"points.":[166],"(3)":[167],"Agent":[168],"strongly":[171],"multi-dimensional,":[172],"model":[174],"rankings":[175],"varying":[176],"task":[178],"groups":[179],"metrics,":[181],"indicating":[182],"that":[183,196],"heterogeneous":[185],"essential.":[189],"Claw-Eval":[190],"highlights":[191],"directions":[192],"developing":[194],"only":[199],"capable":[200],"but":[201],"reliably":[202],"deployable.":[203]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2026-04-09T00:00:00"}
