{"id":"https://openalex.org/W7138171633","doi":"https://doi.org/10.48550/arxiv.2603.15039","title":"GUI-CEval: A Hierarchical and Comprehensive Chinese Benchmark for Mobile GUI Agents","display_name":"GUI-CEval: A Hierarchical and Comprehensive Chinese Benchmark for Mobile GUI Agents","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138171633","doi":"https://doi.org/10.48550/arxiv.2603.15039"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15039","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15039","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15039","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129748007","display_name":"Yang Li","orcid":"https://orcid.org/0009-0003-0093-4284"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129711916","display_name":"Yuchen Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129643841","display_name":"Haoyu Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Haoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129716850","display_name":"Zhiqiang Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Zhiqiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129733101","display_name":"Hongzhen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hongzhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085050181","display_name":"Kaiyang Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Kaiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087833537","display_name":"Changpeng Yang","orcid":"https://orcid.org/0000-0002-1824-3831"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Changpeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129735238","display_name":"Jinyang Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jinyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129744033","display_name":"Jiaming Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Jiaming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129676129","display_name":"Runyu Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Runyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129708508","display_name":"Ying Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.47350001335144043,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.47350001335144043,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.14970000088214874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.051600001752376556,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7671999931335449},{"id":"https://openalex.org/keywords/strengths-and-weaknesses","display_name":"Strengths and weaknesses","score":0.5205000042915344},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5073000192642212},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5042999982833862},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4796000123023987},{"id":"https://openalex.org/keywords/graphical-user-interface","display_name":"Graphical user interface","score":0.39910000562667847},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.39169999957084656},{"id":"https://openalex.org/keywords/mobile-device","display_name":"Mobile device","score":0.37540000677108765}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7671999931335449},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.732200026512146},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5615000128746033},{"id":"https://openalex.org/C63882131","wikidata":"https://www.wikidata.org/wiki/Q17122954","display_name":"Strengths and weaknesses","level":2,"score":0.5205000042915344},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5073000192642212},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5042999982833862},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4796000123023987},{"id":"https://openalex.org/C37789001","wikidata":"https://www.wikidata.org/wiki/Q782543","display_name":"Graphical user interface","level":2,"score":0.39910000562667847},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.37540000677108765},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3379000127315521},{"id":"https://openalex.org/C115121344","wikidata":"https://www.wikidata.org/wiki/Q6887140","display_name":"Mobile interaction","level":3,"score":0.31200000643730164},{"id":"https://openalex.org/C2988145974","wikidata":"https://www.wikidata.org/wiki/Q620615","display_name":"Mobile apps","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2980000078678131},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C144543869","wikidata":"https://www.wikidata.org/wiki/Q2738570","display_name":"Mobile computing","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C95491727","wikidata":"https://www.wikidata.org/wiki/Q992968","display_name":"Mobile telephony","level":3,"score":0.275299996137619},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.2700999975204468},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C2780102126","wikidata":"https://www.wikidata.org/wiki/Q10928179","display_name":"Online and offline","level":2,"score":0.2517000138759613}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15039","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15039","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15039","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15039","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"progress":[1],"in":[2,166,175],"Multimodal":[3],"Large":[4],"Language":[5],"Models":[6],"(MLLMs)":[7],"has":[8],"enabled":[9],"mobile":[10,40,84,197],"GUI":[11,50,85,198],"agents":[12],"capable":[13],"of":[14,37,195],"visual":[15],"perception,":[16,119],"cross-modal":[17],"reasoning,":[18],"and":[19,28,34,58,102,112,123,129,138,146,156,169,184,191],"interactive":[20],"control.":[21],"However,":[22],"existing":[23],"benchmarks":[24],"are":[25,127],"largely":[26],"English-centric":[27],"fail":[29],"to":[30,61,69,135,187],"capture":[31],"the":[32,38,63,78,193],"linguistic":[33],"interaction":[35],"characteristics":[36],"Chinese":[39,83,196],"ecosystem.":[41],"They":[42],"also":[43],"focus":[44],"on":[45,89,142],"isolated":[46],"skills":[47],"such":[48,153],"as":[49,154],"grounding":[51],"or":[52],"offline":[53],"agent,":[54],"lacking":[55],"a":[56,104,182],"unified":[57],"fine-grained":[59],"framework":[60],"assess":[62],"full":[64],"capability":[65,189],"chain":[66],"from":[67],"perception":[68],"execution.":[70],"To":[71],"address":[72],"this":[73],"gap,":[74],"we":[75],"introduce":[76],"GUI-CEval,":[77],"first":[79],"comprehensive":[80,183],"benchmark":[81,186],"for":[82],"agents,":[86],"built":[87],"entirely":[88],"physical":[90],"device":[91,100],"environments.":[92],"GUI-CEval":[93,180],"spans":[94],"201":[95],"mainstream":[96],"apps":[97],"across":[98],"four":[99],"types":[101],"adopts":[103],"two-level":[105],"structure":[106],"that":[107,150],"evaluates":[108],"both":[109],"atomic":[110],"abilities":[111],"realistic":[113],"application-level":[114],"performance":[115],"along":[116],"five":[117],"dimensions:":[118],"planning,":[120],"reflection,":[121],"execution,":[122],"evaluation.":[124],"All":[125],"data":[126],"collected":[128],"verified":[130],"through":[131],"multi-stage":[132],"manual":[133],"processes":[134],"ensure":[136],"authenticity":[137],"reproducibility.":[139],"Extensive":[140],"experiments":[141],"20":[143],"representative":[144],"MLLMs":[145,161],"multi-agent":[147],"systems":[148],"show":[149],"while":[151],"models":[152],"Qwen2.5-VL":[155],"UI-TARS":[157],"perform":[158],"competitively,":[159],"most":[160],"still":[162],"exhibit":[163],"clear":[164],"weaknesses":[165],"reflective":[167],"decision-making":[168],"post-action":[170],"self-evaluation,":[171],"limiting":[172],"their":[173],"reliability":[174],"real-world":[176],"interactions.":[177],"We":[178],"hope":[179],"provides":[181],"interpretable":[185],"guide":[188],"diagnosis":[190],"advance":[192],"development":[194],"agents.":[199]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
