{"id":"https://openalex.org/W7160316704","doi":"https://doi.org/10.48550/arxiv.2605.02630","title":"AutoFocus: Uncertainty-Aware Active Visual Search for GUI Grounding","display_name":"AutoFocus: Uncertainty-Aware Active Visual Search for GUI Grounding","publication_year":2026,"publication_date":"2026-05-04","ids":{"openalex":"https://openalex.org/W7160316704","doi":"https://doi.org/10.48550/arxiv.2605.02630"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.02630","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02630","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.02630","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053695187","display_name":"Ruilin Yao","orcid":"https://orcid.org/0009-0002-6654-2294"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Ruilin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135308339","display_name":"Shegnwu Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Shegnwu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135301045","display_name":"Tianyu Zou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Tianyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003832579","display_name":"Shili Xiong","orcid":"https://orcid.org/0000-0003-1167-525X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Shili","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123029946","display_name":"Yi Rong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rong, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9682999849319458,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9682999849319458,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.00279999990016222,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.6949999928474426},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.6067000031471252},{"id":"https://openalex.org/keywords/zoom","display_name":"Zoom","score":0.5284000039100647},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.4417000114917755},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.42080000042915344},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.3788999915122986},{"id":"https://openalex.org/keywords/rectangle","display_name":"Rectangle","score":0.37720000743865967},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.35670000314712524},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.34369999170303345}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7896000146865845},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.6949999928474426},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.6067000031471252},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5442000031471252},{"id":"https://openalex.org/C124913957","wikidata":"https://www.wikidata.org/wiki/Q1232548","display_name":"Zoom","level":3,"score":0.5284000039100647},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4675999879837036},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.4417000114917755},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.42080000042915344},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3788999915122986},{"id":"https://openalex.org/C2781302577","wikidata":"https://www.wikidata.org/wiki/Q209","display_name":"Rectangle","level":2,"score":0.37720000743865967},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.35670000314712524},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.33079999685287476},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.32910001277923584},{"id":"https://openalex.org/C3770464","wikidata":"https://www.wikidata.org/wiki/Q775963","display_name":"Smoothing","level":2,"score":0.3262999951839447},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.30820000171661377},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C4924752","wikidata":"https://www.wikidata.org/wiki/Q184148","display_name":"Plug-in","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.2815999984741211},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C37789001","wikidata":"https://www.wikidata.org/wiki/Q782543","display_name":"Graphical user interface","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.27160000801086426},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.26260000467300415},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2574000060558319},{"id":"https://openalex.org/C155846161","wikidata":"https://www.wikidata.org/wiki/Q1143367","display_name":"Graphical model","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.02630","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02630","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.02630","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02630","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"have":[3],"enabled":[4],"autonomous":[5],"GUI":[6,84],"agents":[7],"that":[8,90],"translate":[9],"natural":[10],"language":[11],"instructions":[12],"into":[13,117],"executable":[14],"screen":[15],"coordinates.":[16],"However,":[17],"grounding":[18],"performance":[19],"degrades":[20],"in":[21,93],"high-resolution":[22],"interfaces,":[23],"where":[24,61],"dense":[25],"layouts":[26],"and":[27,38,65,112,135,139,168,176],"small":[28],"interactive":[29],"elements":[30],"expose":[31],"a":[32,55,76,104],"resolution":[33],"gap":[34],"between":[35],"modern":[36],"displays":[37],"model":[39],"input":[40],"constraints.":[41],"Existing":[42],"zoom-in":[43],"strategies":[44],"rely":[45],"on":[46,129,166],"fixed":[47],"anchors,":[48],"heuristic":[49],"grids,":[50],"or":[51],"reinforcement":[52],"learning,":[53],"lacking":[54],"principled":[56],"mechanism":[57],"to":[58,103,143],"adaptively":[59],"determine":[60],"refinement":[62],"is":[63,89],"needed":[64],"how":[66],"much":[67],"spatial":[68,98,121],"uncertainty":[69],"should":[70],"be":[71],"explored.":[72],"We":[73],"propose":[74],"AutoFocus,":[75],"training-free,":[77],"uncertainty-aware":[78],"active":[79],"visual":[80,151],"search":[81],"framework":[82],"for":[83],"grounding.":[85],"Our":[86],"key":[87],"insight":[88],"token-level":[91],"perplexity":[92],"coordinate":[94,110],"generation":[95],"naturally":[96],"reflects":[97],"uncertainty.":[99,127],"Rather":[100],"than":[101],"committing":[102],"single":[105],"prediction,":[106],"AutoFocus":[107],"samples":[108],"multiple":[109],"hypotheses":[111],"converts":[113],"their":[114],"axial":[115],"perplexities":[116],"an":[118],"anisotropic":[119],"gaussian":[120],"probability":[122],"field,":[123,131],"explicitly":[124],"modeling":[125],"directional":[126],"Based":[128],"this":[130],"we":[132],"generate":[133],"global":[134],"local":[136],"region":[137],"proposals":[138],"introduce":[140],"Shape-Aware":[141],"Zooming":[142],"balance":[144],"tight":[145],"localization":[146],"with":[147],"contextual":[148],"preservation.":[149],"A":[150],"prompt-based":[152],"aggregation":[153],"step":[154],"then":[155],"selects":[156],"the":[157],"most":[158],"consistent":[159,171],"prediction":[160],"via":[161],"structured":[162],"comparison.":[163],"Extensive":[164],"experiments":[165],"ScreenSpot-Pro":[167],"ScreenSpot-V2":[169],"demonstrate":[170],"improvements":[172],"across":[173],"both":[174],"general-purpose":[175],"GUI-specialized":[177],"VLMs.":[178]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-06T00:00:00"}
