{"id":"https://openalex.org/W7133325074","doi":"https://doi.org/10.48550/arxiv.2603.00600","title":"I-Perceive: A Foundation Model for Active Perception with Language Instructions","display_name":"I-Perceive: A Foundation Model for Active Perception with Language Instructions","publication_year":2026,"publication_date":"2026-02-28","ids":{"openalex":"https://openalex.org/W7133325074","doi":"https://doi.org/10.48550/arxiv.2603.00600"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00600","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020210924","display_name":"Yongxi Huang","orcid":"https://orcid.org/0000-0002-8000-3070"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yongxi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127896761","display_name":"Zhuohang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhuohang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128026902","display_name":"Wenjing Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Wenjing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107334651","display_name":"CeWu Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Cewu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5077008575","display_name":"Panpan Cai","orcid":"https://orcid.org/0000-0003-1926-4842"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Panpan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9089000225067139,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9089000225067139,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.02850000001490116,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.011599999852478504,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/active-perception","display_name":"Active perception","score":0.7401000261306763},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6266999840736389},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.571399986743927},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5453000068664551},{"id":"https://openalex.org/keywords/active-vision","display_name":"Active vision","score":0.47929999232292175},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.4390999972820282},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.41519999504089355}],"concepts":[{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.7401000261306763},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6790000200271606},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6266999840736389},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.571399986743927},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5453000068664551},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5317000150680542},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.47929999232292175},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.4390999972820282},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.42239999771118164},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.41519999504089355},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.38769999146461487},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3828999996185303},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3619999885559082},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.351500004529953},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3269999921321869},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.262800008058548},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2581999897956848}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.47139695286750793,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Active":[0],"perception,":[1],"the":[2],"ability":[3],"of":[4,168],"a":[5,72,106,112,133],"robot":[6],"to":[7,12,39,177],"proactively":[8],"adjust":[9],"its":[10],"viewpoint":[11],"acquire":[13],"task-relevant":[14],"information,":[15],"is":[16],"essential":[17],"for":[18,27,75,84,126],"robust":[19],"operation":[20],"in":[21,58,161],"unstructured":[22],"real-world":[23,137],"environments.":[24,89],"While":[25],"critical":[26],"downstream":[28],"tasks":[29],"such":[30],"as":[31],"manipulation,":[32],"existing":[33],"approaches":[34],"have":[35],"largely":[36],"been":[37],"confined":[38],"local":[40],"settings":[41],"(e.g.,":[42,49],"table-top":[43],"scenes)":[44],"with":[45,55,111],"fixed":[46],"perception":[47,54,77],"objectives":[48],"occlusion":[50],"reduction).":[51],"Addressing":[52],"active":[53,76,127],"open-ended":[56,96],"intents":[57],"large-scale":[59],"environments":[60],"remains":[61],"an":[62,146],"open":[63],"challenge.":[64],"To":[65],"bridge":[66],"this":[67],"gap,":[68],"we":[69],"propose":[70],"I-Perceive,":[71],"foundation":[73,114],"model":[74],"conditioned":[78],"on":[79,100,132],"natural":[80],"language":[81,97],"instructions,":[82,98],"designed":[83],"mobile":[85],"manipulators":[86],"and":[87,119,140,148,165,172,180],"indoor":[88],"I-Perceive":[90,116,131,156],"predicts":[91],"camera":[92,170],"views":[93],"that":[94,155],"follows":[95],"based":[99],"image-based":[101],"scene":[102],"contexts.":[103],"By":[104],"fusing":[105],"Vision-Language":[107],"Model":[108],"(VLM)":[109],"backbone":[110],"geometric":[113,120],"model,":[115],"bridges":[117],"semantic":[118],"understanding,":[121],"thus":[122],"enabling":[123],"effective":[124],"reasoning":[125],"perception.":[128],"We":[129],"train":[130],"diverse":[134],"dataset":[135],"comprising":[136],"scene-scanning":[138],"data":[139,150],"simulation":[141],"data,":[142],"both":[143,162],"processed":[144],"via":[145],"automated":[147],"scalable":[149],"generation":[151],"pipeline.":[152],"Experiments":[153],"demonstrate":[154],"significantly":[157],"outperforms":[158],"state-of-the-art":[159],"VLMs":[160],"prediction":[163],"accuracy":[164],"instruction":[166],"following":[167],"generated":[169],"views,":[171],"exhibits":[173],"strong":[174],"zero-shot":[175],"generalization":[176],"novel":[178],"scenes":[179],"tasks.":[181]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-04T00:00:00"}
