{"id":"https://openalex.org/W7155509716","doi":"https://doi.org/10.48550/arxiv.2604.21375","title":"VLAA-GUI: Knowing When to Stop, Recover, and Search, A Modular Framework for GUI Automation","display_name":"VLAA-GUI: Knowing When to Stop, Recover, and Search, A Modular Framework for GUI Automation","publication_year":2026,"publication_date":"2026-04-23","ids":{"openalex":"https://openalex.org/W7155509716","doi":"https://doi.org/10.48550/arxiv.2604.21375"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.21375","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.21375","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134542012","display_name":"Qijun Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Han, Qijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048179343","display_name":"Haoqin Tu","orcid":"https://orcid.org/0000-0002-5627-249X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Haoqin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134513931","display_name":"Zijun Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134552946","display_name":"Haoyue Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Haoyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134505889","display_name":"Yiyang Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134564280","display_name":"Nancy Lau","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lau, Nancy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016892225","display_name":"\u00c1lvaro A. C\u00e1rdenas","orcid":"https://orcid.org/0000-0002-5142-9750"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cardenas, Alvaro A.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134514053","display_name":"Yuhui Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yuhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056643573","display_name":"Ran Xu","orcid":"https://orcid.org/0000-0003-2913-9420"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Ran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032046813","display_name":"Caiming Xiong","orcid":"https://orcid.org/0000-0003-0349-8628"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Caiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134517060","display_name":"Zeyu Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zeyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051534896","display_name":"Huaxiu Yao","orcid":"https://orcid.org/0000-0002-8691-9629"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Huaxiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134466837","display_name":"Yuyin Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yuyin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5022344556","display_name":"Cihang Xie","orcid":"https://orcid.org/0000-0003-1243-8045"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Cihang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5134542012"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.16680000722408295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.16680000722408295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12607","display_name":"Personal Information Management and User Behavior","score":0.13830000162124634,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10444","display_name":"Context-Aware Activity Recognition Systems","score":0.06360000371932983,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.6919000148773193},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6157000064849854},{"id":"https://openalex.org/keywords/automation","display_name":"Automation","score":0.5414999723434448},{"id":"https://openalex.org/keywords/circuit-breaker","display_name":"Circuit breaker","score":0.35839998722076416},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.3425000011920929},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.32580000162124634},{"id":"https://openalex.org/keywords/autonomous-agent","display_name":"Autonomous agent","score":0.32339999079704285},{"id":"https://openalex.org/keywords/semantic-reasoner","display_name":"Semantic reasoner","score":0.3197999894618988}],"concepts":[{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.6919000148773193},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6913999915122986},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6157000064849854},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.5414999723434448},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.421999990940094},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3752000033855438},{"id":"https://openalex.org/C61352017","wikidata":"https://www.wikidata.org/wiki/Q211058","display_name":"Circuit breaker","level":2,"score":0.35839998722076416},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.3425000011920929},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.32580000162124634},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C9616225","wikidata":"https://www.wikidata.org/wiki/Q3929429","display_name":"Semantic reasoner","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3149000108242035},{"id":"https://openalex.org/C74072328","wikidata":"https://www.wikidata.org/wiki/Q1142726","display_name":"Intelligent agent","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2915000021457672},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C67953723","wikidata":"https://www.wikidata.org/wiki/Q192525","display_name":"Workstation","level":2,"score":0.2671999931335449},{"id":"https://openalex.org/C111498074","wikidata":"https://www.wikidata.org/wiki/Q173326","display_name":"Formal verification","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.2554999887943268}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.21375","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.21375","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.5817243456840515,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Autonomous":[0],"GUI":[1,35],"agents":[2,10,21],"face":[3],"two":[4,178],"fundamental":[5],"challenges:":[6],"early":[7],"stopping,":[8],"where":[9,20],"prematurely":[11],"declare":[12],"success":[13,61],"without":[14,28],"verifiable":[15],"evidence,":[16],"and":[17,52,63,108,148,173,182,193],"repetitive":[18],"loops,":[19],"cycle":[22],"through":[23],"the":[24,45,200,236,246],"same":[25],"failing":[26],"actions":[27,147],"recovery.":[29],"We":[30,139,162],"present":[31],"VLAA-GUI,":[32],"a":[33,55,88,128,142,149,210,223,227],"modular":[34],"agentic":[36],"framework":[37],"built":[38],"around":[39],"three":[40,198,218],"integrated":[41],"components":[42,220],"that":[43,74,216,245],"guide":[44],"system":[46],"on":[47,158,177,188,191,195,207],"when":[48,160,235],"to":[49,112],"Stop,":[50],"Recover,":[51],"Search.":[53],"First,":[54],"mandatory":[56,89],"Completeness":[57],"Verifier":[58],"enforces":[59],"UI-observable":[60],"criteria":[62],"verification":[64],"at":[65],"every":[66],"finish":[67],"step":[68,237],"--":[69],"with":[70,78,131,180],"an":[71,116],"agent-level":[72],"verifier":[73],"cross-examines":[75],"completion":[76],"claims":[77],"decision":[79],"rules,":[80],"rejecting":[81],"those":[82],"lacking":[83],"direct":[84],"visual":[85],"evidence.":[86],"Second,":[87],"Loop":[90,247],"Breaker":[91,248],"provides":[92],"multi-tier":[93],"filtering:":[94],"switching":[95],"interaction":[96],"mode":[97],"after":[98,104],"repeated":[99],"failures,":[100],"forcing":[101],"strategy":[102,113],"changes":[103],"persistent":[105],"screen-state":[106],"recurrence,":[107],"binding":[109],"reflection":[110],"signals":[111],"shifts.":[114],"Third,":[115],"on-demand":[117],"Search":[118],"Agent":[119,144,151],"searches":[120],"online":[121],"for":[122,145,152,253],"unfamiliar":[123],"workflows":[124],"by":[125],"directly":[126],"querying":[127],"capable":[129],"LLM":[130],"search":[132],"ability,":[133],"returning":[134],"results":[135],"as":[136],"plain":[137],"text.":[138],"additionally":[140],"integrate":[141],"Coding":[143],"code-intensive":[146],"Grounding":[150],"precise":[153],"action":[154],"grounding,":[155],"both":[156,189],"invoked":[157],"demand":[159],"required.":[161],"evaluate":[163],"VLAA-GUI":[164],"across":[165],"five":[166,201],"top-tier":[167],"backbones,":[168],"including":[169],"Opus":[170],"4.5,":[171],"4.6":[172],"Gemini":[174],"3.1":[175],"Pro,":[176],"benchmarks":[179],"Linux":[181],"Windows":[183],"tasks,":[184],"achieving":[185],"top":[186],"performance":[187,205],"(77.5%":[190],"OSWorld":[192,208],"61.0%":[194],"WindowsAgentArena).":[196],"Notably,":[197],"of":[199],"backbones":[202],"surpass":[203],"human":[204],"(72.4%)":[206],"in":[209],"single":[211],"pass.":[212],"Ablation":[213],"studies":[214],"show":[215],"all":[217],"proposed":[219],"consistently":[221],"improve":[222],"strong":[224],"backbone,":[225],"while":[226],"weaker":[228],"backbone":[229],"benefits":[230],"more":[231],"from":[232],"these":[233],"tools":[234],"budget":[238],"is":[239],"sufficient.":[240],"Further":[241],"analysis":[242],"also":[243],"shows":[244],"nearly":[249],"halves":[250],"wasted":[251],"steps":[252],"loop-prone":[254],"models.":[255]},"counts_by_year":[],"updated_date":"2026-04-25T06:06:54.107920","created_date":"2026-04-25T00:00:00"}
