{"id":"https://openalex.org/W7131851703","doi":"https://doi.org/10.48550/arxiv.2602.22683","title":"SUPERGLASSES: Benchmarking Vision Language Models as Intelligent Agents for AI Smart Glasses","display_name":"SUPERGLASSES: Benchmarking Vision Language Models as Intelligent Agents for AI Smart Glasses","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7131851703","doi":"https://doi.org/10.48550/arxiv.2602.22683"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.22683","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127299136","display_name":"Zhuohang Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jiang, Zhuohang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127328586","display_name":"Xu Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Xu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127300893","display_name":"Haohao Qu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Haohao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Shanru","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Shanru","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127244507","display_name":"Kanglong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Kanglong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Fan, Wenqi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Wenqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127087880","display_name":"Qing Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Qing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5127299136"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.7608000040054321},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6417999863624573},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.6327000260353088},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.525600016117096},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.4481000006198883},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.3871999979019165},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.3580999970436096},{"id":"https://openalex.org/keywords/intelligent-agent","display_name":"Intelligent agent","score":0.3521000146865845}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.7608000040054321},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7513999938964844},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6417999863624573},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.6327000260353088},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.525600016117096},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5206999778747559},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.4481000006198883},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.3871999979019165},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.3580999970436096},{"id":"https://openalex.org/C74072328","wikidata":"https://www.wikidata.org/wiki/Q1142726","display_name":"Intelligent agent","level":2,"score":0.3521000146865845},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3467000126838684},{"id":"https://openalex.org/C150594956","wikidata":"https://www.wikidata.org/wiki/Q1334829","display_name":"Wearable computer","level":2,"score":0.322299987077713},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C2780202397","wikidata":"https://www.wikidata.org/wiki/Q2294986","display_name":"Smart objects","level":3,"score":0.30070000886917114},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.29499998688697815},{"id":"https://openalex.org/C152223200","wikidata":"https://www.wikidata.org/wiki/Q3055471","display_name":"Smart environment","level":3,"score":0.29339998960494995},{"id":"https://openalex.org/C54290928","wikidata":"https://www.wikidata.org/wiki/Q4845080","display_name":"Wearable technology","level":3,"score":0.29010000824928284},{"id":"https://openalex.org/C56397880","wikidata":"https://www.wikidata.org/wiki/Q6044094","display_name":"Intelligent decision support system","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C29794715","wikidata":"https://www.wikidata.org/wiki/Q5362345","display_name":"Smartwatch","level":3,"score":0.27810001373291016},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C114073186","wikidata":"https://www.wikidata.org/wiki/Q2631895","display_name":"Automated planning and scheduling","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.25279998779296875}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.22683","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.22683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.22683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.22683","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.5566666722297668,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"rapid":[1],"advancement":[2],"of":[3,8,78,146],"AI-powered":[4],"smart":[5,41,63,105,155,192],"glasses,":[6],"one":[7],"the":[9,56,76,93,144,186],"hottest":[10],"wearable":[11],"devices,":[12],"has":[13],"unlocked":[14],"new":[15],"frontiers":[16],"for":[17,188],"multimodal":[18,50,154,171],"interaction,":[19],"with":[20,123],"Visual":[21],"Question":[22],"Answering":[23],"(VQA)":[24],"over":[25],"external":[26,83],"knowledge":[27,84],"sources":[28],"emerging":[29],"as":[30],"a":[31,153],"core":[32],"application.":[33],"Existing":[34],"Vision":[35],"Language":[36],"Models":[37],"(VLMs)":[38],"adapted":[39],"to":[40,61],"glasses":[42,64,106,156,193],"are":[43],"typically":[44],"trained":[45],"and":[46,58,67,118,127,170,184],"evaluated":[47],"on":[48,99,135],"traditional":[49],"datasets;":[51],"however,":[52],"these":[53],"datasets":[54],"lack":[55],"variety":[57],"realism":[59],"needed":[60],"reflect":[62],"usage":[65],"scenarios":[66],"diverge":[68],"from":[69],"their":[70],"specific":[71],"challenges,":[72],"where":[73],"accurately":[74],"identifying":[75],"object":[77,166],"interest":[79],"must":[80],"precede":[81],"any":[82],"retrieval.":[85],"To":[86,142],"bridge":[87],"this":[88,136],"gap,":[89],"we":[90,149],"introduce":[91],"SUPERGLASSES,":[92],"first":[94],"comprehensive":[95],"VQA":[96,194],"benchmark":[97],"built":[98],"real-world":[100],"data":[101],"entirely":[102],"collected":[103],"by":[104,163,181],"devices.":[107],"SUPERGLASSES":[108],"comprises":[109],"2,422":[110],"egocentric":[111],"image-question":[112],"pairs":[113],"spanning":[114],"14":[115],"image":[116],"domains":[117],"8":[119],"query":[120,168],"categories,":[121],"enriched":[122],"full":[124],"search":[125],"trajectories":[126],"reasoning":[128],"annotations.":[129],"We":[130],"evaluate":[131],"26":[132],"representative":[133],"VLMs":[134],"benchmark,":[137],"revealing":[138],"significant":[139],"performance":[140],"gaps.":[141],"address":[143],"limitations":[145],"existing":[147],"models,":[148],"further":[150],"propose":[151],"SUPERLENS,":[152],"agent":[157,175],"that":[158],"enables":[159],"retrieval-augmented":[160],"answer":[161],"generation":[162],"integrating":[164],"automatic":[165],"detection,":[167],"decoupling,":[169],"web":[172],"search.":[173],"Our":[174],"achieves":[176],"state-of-the-art":[177],"performance,":[178],"surpassing":[179],"GPT-4o":[180],"2.19":[182],"percent,":[183],"highlights":[185],"need":[187],"task-specific":[189],"solutions":[190],"in":[191],"scenarios.":[195]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-28T00:00:00"}
