{"id":"https://openalex.org/W7128740365","doi":"https://doi.org/10.48550/arxiv.2602.11025","title":"Reality Copilot: Voice-First Human-AI Collaboration in Mixed Reality Using Large Multimodal Models","display_name":"Reality Copilot: Voice-First Human-AI Collaboration in Mixed Reality Using Large Multimodal Models","publication_year":2026,"publication_date":"2026-02-11","ids":{"openalex":"https://openalex.org/W7128740365","doi":"https://doi.org/10.48550/arxiv.2602.11025"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.11025","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5007775787","display_name":"Liuchuan Yu","orcid":"https://orcid.org/0000-0003-2375-1862"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu, Liuchuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125768332","display_name":"Yongqi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yongqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5084967276","display_name":"Lap-Fai Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Lap-Fai","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5007775787"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.49790000915527344,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.49790000915527344,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.20900000631809235,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10888","display_name":"Augmented Reality Applications","score":0.04600000008940697,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mixed-reality","display_name":"Mixed reality","score":0.8633999824523926},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6514000296592712},{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.6323000192642212},{"id":"https://openalex.org/keywords/augmented-reality","display_name":"Augmented reality","score":0.5537999868392944},{"id":"https://openalex.org/keywords/computer-mediated-reality","display_name":"Computer-mediated reality","score":0.4088999927043915},{"id":"https://openalex.org/keywords/virtual-reality","display_name":"Virtual reality","score":0.38929998874664307},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.3874000012874603},{"id":"https://openalex.org/keywords/entertainment","display_name":"Entertainment","score":0.38019999861717224}],"concepts":[{"id":"https://openalex.org/C206776904","wikidata":"https://www.wikidata.org/wiki/Q1758389","display_name":"Mixed reality","level":3,"score":0.8633999824523926},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7305999994277954},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.7290999889373779},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6514000296592712},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.6323000192642212},{"id":"https://openalex.org/C153715457","wikidata":"https://www.wikidata.org/wiki/Q254183","display_name":"Augmented reality","level":2,"score":0.5537999868392944},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.44510000944137573},{"id":"https://openalex.org/C134202134","wikidata":"https://www.wikidata.org/wiki/Q5157399","display_name":"Computer-mediated reality","level":4,"score":0.4088999927043915},{"id":"https://openalex.org/C194969405","wikidata":"https://www.wikidata.org/wiki/Q170519","display_name":"Virtual reality","level":2,"score":0.38929998874664307},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3874000012874603},{"id":"https://openalex.org/C512170562","wikidata":"https://www.wikidata.org/wiki/Q173799","display_name":"Entertainment","level":2,"score":0.38019999861717224},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.33239999413490295},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C2780554381","wikidata":"https://www.wikidata.org/wiki/Q2063340","display_name":"Sensemaking","level":2,"score":0.3192000091075897},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.30820000171661377},{"id":"https://openalex.org/C2778152352","wikidata":"https://www.wikidata.org/wiki/Q5165061","display_name":"Content (measure theory)","level":2,"score":0.29679998755455017},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.28940001130104065},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C13854087","wikidata":"https://www.wikidata.org/wiki/Q1061656","display_name":"Interaction design","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C71611378","wikidata":"https://www.wikidata.org/wiki/Q5165191","display_name":"Contextual design","level":3,"score":0.2590999901294708},{"id":"https://openalex.org/C8678698","wikidata":"https://www.wikidata.org/wiki/Q4801094","display_name":"Artificial reality","level":5,"score":0.25060001015663147}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.11025","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.11025","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.11025","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.11025","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.47664591670036316,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Multimodal":[1],"Models":[2],"(LMMs)":[3],"have":[4,42],"shown":[5],"strong":[6],"potential":[7],"for":[8,91],"assisting":[9],"users":[10],"in":[11,37,147],"tasks,":[12],"such":[13,30],"as":[14,31],"programming,":[15],"content":[16,112,132],"creation,":[17],"and":[18,33,49,68,114,133],"information":[19,116],"access,":[20],"yet":[21],"their":[22],"interaction":[23],"remains":[24],"largely":[25],"limited":[26,70],"to":[27,74,97,120],"traditional":[28],"interfaces":[29],"desktops":[32],"smartphones.":[34],"Meanwhile,":[35],"advances":[36],"mixed":[38,92,148],"reality":[39,93],"(MR)":[40],"hardware":[41],"enabled":[43],"applications":[44],"that":[45,94],"extend":[46],"beyond":[47],"entertainment":[48],"into":[50],"everyday":[51],"use.":[52],"However,":[53],"most":[54],"existing":[55],"MR":[56],"systems":[57],"rely":[58],"primarily":[59],"on":[60],"manual":[61],"input":[62],"(e.g.,":[63],"hand":[64],"gestures":[65],"or":[66],"controllers)":[67],"provide":[69],"intelligent":[71],"assistance":[72],"due":[73],"the":[75,140],"lack":[76],"of":[77,107,143],"integration":[78],"with":[79],"large-scale":[80],"AI":[81],"models.":[82],"We":[83],"present":[84],"Reality":[85,123],"Copilot,":[86],"a":[87],"voice-first":[88],"human-AI":[89,145],"assistant":[90],"leverages":[95],"LMMs":[96],"enable":[98],"natural":[99],"speech-based":[100],"interaction.":[101],"The":[102],"system":[103],"supports":[104],"contextual":[105],"understanding":[106],"physical":[108],"environments,":[109],"realistic":[110],"3D":[111],"generation,":[113],"real-time":[115],"retrieval.":[117],"In":[118],"addition":[119],"in-headset":[121],"interaction,":[122],"Copilot":[124],"facilitates":[125],"cross-platform":[126],"workflows":[127],"by":[128],"generating":[129],"context-aware":[130],"textual":[131],"exporting":[134],"generated":[135],"assets.":[136],"This":[137],"work":[138],"explores":[139],"design":[141],"space":[142],"LMM-powered":[144],"collaboration":[146],"reality.":[149]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-13T00:00:00"}
