{"id":"https://openalex.org/W7154383465","doi":"https://doi.org/10.48550/arxiv.2604.11201","title":"CocoaBench: Evaluating Unified Digital Agents in the Wild","display_name":"CocoaBench: Evaluating Unified Digital Agents in the Wild","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154383465","doi":"https://doi.org/10.48550/arxiv.2604.11201"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11201","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11201","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11201","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133582524","display_name":"CocoaBench Team","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"CocoaBench Team","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133592416","display_name":"Shibo Hao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hao, Shibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101685928","display_name":"Zhining Zhang","orcid":"https://orcid.org/0000-0001-6357-1546"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhining","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133610639","display_name":"Zhiqi Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Zhiqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133557725","display_name":"Tianyang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Tianyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133607692","display_name":"Yuheng Zha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zha, Yuheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133592779","display_name":"Qiyue Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Qiyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133595289","display_name":"Jixuan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133605994","display_name":"Zilong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zilong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058829926","display_name":"Zhoujun Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Zhoujun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133987685","display_name":"Haoxiang Zhang","orcid":"https://orcid.org/0009-0001-2997-7726"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Haoxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133597968","display_name":"Junli Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Junli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133582402","display_name":"Hexi Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Hexi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133617955","display_name":"Boyuan Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Boyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133584673","display_name":"Kun Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Kun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133613844","display_name":"Yu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133564066","display_name":"Feng Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Feng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133615637","display_name":"Licheng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Licheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133602040","display_name":"Yijiang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yijiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133594444","display_name":"Zhifei Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhifei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133584773","display_name":"Zhengtao Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Zhengtao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095886523","display_name":"Pracha Promthaw","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Promthaw, Pracha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133573817","display_name":"Tommaso Cerruti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cerruti, Tommaso","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133624835","display_name":"Xiaohan Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Xiaohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133568386","display_name":"Ziqiao Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Ziqiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133554741","display_name":"Jingbo Shang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shang, Jingbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133605398","display_name":"Lianhui Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Lianhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133620244","display_name":"Julian McAuley","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McAuley, Julian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074244591","display_name":"Eric Poe Xing","orcid":"https://orcid.org/0000-0002-3683-4280"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Eric P.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133598012","display_name":"Zhengzhong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhengzhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103161860","display_name":"Rupesh Kumar Srivastava","orcid":"https://orcid.org/0000-0002-4032-4267"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Srivastava, Rupesh Kumar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133563417","display_name":"Zhiting Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Zhiting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":32,"corresponding_author_ids":["https://openalex.org/A5133582524"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.16290000081062317,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.16290000081062317,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.13699999451637268,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1103999987244606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6026999950408936},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.597100019454956},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.5268999934196472},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5112000107765198},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.45179998874664307},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.29330000281333923}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7006999850273132},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6026999950408936},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.597100019454956},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.5268999934196472},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5112000107765198},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.45179998874664307},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.4156999886035919},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3569999933242798},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3140000104904175},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3000999987125397},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C5894958","wikidata":"https://www.wikidata.org/wiki/Q2297769","display_name":"Software agent","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C165825675","wikidata":"https://www.wikidata.org/wiki/Q1399743","display_name":"Model-based testing","level":4,"score":0.2822999954223633},{"id":"https://openalex.org/C2984968299","wikidata":"https://www.wikidata.org/wiki/Q1077784","display_name":"Software tool","level":3,"score":0.25949999690055847},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11201","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11201","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11201","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11201","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.6535981893539429}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM":[0],"agents":[1,50,63,121],"now":[2],"perform":[3],"strongly":[4],"in":[5,37,147],"software":[6],"engineering,":[7],"deep":[8],"research,":[9],"GUI":[10],"automation,":[11],"and":[12,20,76,85,96,149,153,155],"various":[13],"other":[14],"applications,":[15],"while":[16],"recent":[17],"agent":[18,101],"scaffolds":[19],"models":[21],"are":[22,79],"increasingly":[23],"integrating":[24],"these":[25,35],"capabilities":[26,36],"into":[27],"unified":[28,61],"systems.":[29],"Yet,":[30],"most":[31],"evaluations":[32],"still":[33],"test":[34],"isolation,":[38],"which":[39],"leaves":[40],"a":[41,58,107],"gap":[42],"for":[43,60,111,145],"more":[44],"diverse":[45,100],"use":[46,152],"cases":[47],"that":[48,69,119],"require":[49,70],"to":[51,142],"combine":[52],"different":[53],"capabilities.":[54],"We":[55,103],"introduce":[56],"CocoaBench,":[57,127],"benchmark":[59],"digital":[62],"built":[64],"from":[65,124],"human-designed,":[66],"long-horizon":[67],"tasks":[68],"flexible":[71],"composition":[72],"of":[73],"vision,":[74],"search,":[75],"coding.":[77],"Tasks":[78],"specified":[80],"only":[81,134],"by":[82],"an":[83,86],"instruction":[84],"automatic":[87],"evaluation":[88,98],"function":[89],"over":[90],"the":[91,129],"final":[92],"output,":[93],"enabling":[94],"reliable":[95,125],"scalable":[97],"across":[99,114],"infrastructures.":[102],"also":[104],"present":[105],"CocoaAgent,":[106],"lightweight":[108],"shared":[109],"scaffold":[110],"controlled":[112],"comparison":[113],"model":[115],"backbones.":[116],"Experiments":[117],"show":[118],"current":[120],"remain":[122],"far":[123],"on":[126],"with":[128],"best":[130],"evaluated":[131],"system":[132],"achieving":[133],"45.1%":[135],"success":[136],"rate.":[137],"Our":[138],"analysis":[139],"further":[140],"points":[141],"substantial":[143],"room":[144],"improvement":[146],"reasoning":[148],"planning,":[150],"tool":[151],"execution,":[154],"visual":[156],"grounding.":[157]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2026-04-15T00:00:00"}
