{"id":"https://openalex.org/W7147545502","doi":"https://doi.org/10.48550/arxiv.2603.29020","title":"Emergence WebVoyager: Toward Consistent and Transparent Evaluation of (Web) Agents in The Wild","display_name":"Emergence WebVoyager: Toward Consistent and Transparent Evaluation of (Web) Agents in The Wild","publication_year":2026,"publication_date":"2026-03-30","ids":{"openalex":"https://openalex.org/W7147545502","doi":"https://doi.org/10.48550/arxiv.2603.29020"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29020","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29020","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29020","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030835457","display_name":"Deepak Akkil","orcid":"https://orcid.org/0000-0002-8689-8768"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Akkil, Deepak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078324170","display_name":"Mowafak Allaham","orcid":"https://orcid.org/0000-0001-9211-4598"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Allaham, Mowafak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132646452","display_name":"Amal Raj","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raj, Amal","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052678088","display_name":"Tamer Abuelsaad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abuelsaad, Tamer","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5062098685","display_name":"Ravi Kokku","orcid":"https://orcid.org/0000-0001-6418-1689"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kokku, Ravi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5030835457"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.22769999504089355,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.22769999504089355,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.13680000603199005,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.09009999781847,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6901000142097473},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.649399995803833},{"id":"https://openalex.org/keywords/clarity","display_name":"CLARITY","score":0.6274999976158142},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5284000039100647},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5038999915122986},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.3644999861717224},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.3359000086784363}],"concepts":[{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6901000142097473},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.649399995803833},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6283000111579895},{"id":"https://openalex.org/C2777146004","wikidata":"https://www.wikidata.org/wiki/Q14949826","display_name":"CLARITY","level":2,"score":0.6274999976158142},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5284000039100647},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5038999915122986},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4514999985694885},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3659000098705292},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.3359000086784363},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.302700012922287},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.29739999771118164},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2874000072479248},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C2780873155","wikidata":"https://www.wikidata.org/wiki/Q392811","display_name":"Agent-based model","level":2,"score":0.2572000026702881}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29020","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29020","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29020","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29020","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reliable":[0],"evaluation":[1,36,83],"of":[2,51,77,102,136,150],"AI":[3,34],"agents":[4,22],"operating":[5],"in":[6,32,42,109],"complex,":[7],"real-world":[8],"environments":[9],"requires":[10],"methodologies":[11],"that":[12,38,59,81],"are":[13,23,39],"robust,":[14],"transparent,":[15],"and":[16,56,62,94,107,113,128,156],"contextually":[17],"aligned":[18],"with":[19,131],"the":[20,78,141,148],"tasks":[21],"intended":[24],"to":[25,118],"perform.":[26],"This":[27],"study":[28],"identifies":[29],"persistent":[30],"shortcomings":[31],"existing":[33],"agent":[35,44,159],"practices":[37],"particularly":[40],"acute":[41],"web":[43,158],"evaluation,":[45],"as":[46],"exemplified":[47],"by":[48,145],"our":[49,151],"audit":[50],"WebVoyager,":[52,73],"including":[53],"task-framing":[54],"ambiguity":[55],"operational":[57],"variability":[58],"hinder":[60],"meaningful":[61],"reproducible":[63],"performance":[64,124],"comparisons.":[65],"To":[66],"address":[67],"these":[68],"challenges,":[69],"we":[70],"introduce":[71],"Emergence":[72,96],"an":[74,99,132],"enhanced":[75],"version":[76],"WebVoyager":[79,97],"benchmark":[80],"standardizes":[82],"methodology":[84],"through":[85],"clear":[86],"guidelines":[87],"for":[88,153],"task":[89,111,129],"instantiation,":[90],"failure":[91],"handling,":[92],"annotation,":[93],"reporting.":[95],"achieves":[98],"inter-annotator":[100],"agreement":[101],"95.9\\%,":[103],"indicating":[104],"improved":[105],"clarity":[106],"reliability":[108],"both":[110],"formulation":[112],"evaluation.":[114,160],"Applying":[115],"this":[116],"framework":[117],"evaluate":[119],"OpenAI":[120],"Operator":[121],"reveals":[122],"substantial":[123],"variation":[125],"across":[126],"domains":[127],"types,":[130],"overall":[133],"success":[134],"rate":[135],"68.6\\%,":[137],"substantially":[138],"lower":[139],"than":[140],"87\\%":[142],"previously":[143],"reported":[144],"OpenAI,":[146],"demonstrating":[147],"utility":[149],"approach":[152],"more":[154],"rigorous":[155],"comparable":[157]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-02T00:00:00"}
