{"id":"https://openalex.org/W4415541204","doi":"https://doi.org/10.1145/3746027.3754887","title":"OpenMap: Instruction Grounding via Open-Vocabulary Visual-Language Mapping","display_name":"OpenMap: Instruction Grounding via Open-Vocabulary Visual-Language Mapping","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415541204","doi":"https://doi.org/10.1145/3746027.3754887"},"language":"en","primary_location":{"id":"doi:10.1145/3746027.3754887","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754887","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3754887","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100453910","display_name":"Danyang Li","orcid":"https://orcid.org/0000-0002-7527-4669"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Danyang Li","raw_affiliation_strings":["School of Software, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7527-4669","affiliations":[{"raw_affiliation_string":"School of Software, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zenghui Yang","orcid":"https://orcid.org/0009-0008-1232-1190"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zenghui Yang","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China","School of Software, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-1232-1190","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]},{"raw_affiliation_string":"School of Software, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024560479","display_name":"Guangpeng Qi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210144143","display_name":"Inspur (China)","ror":"https://ror.org/0474p4r72","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210144143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangpeng Qi","raw_affiliation_strings":["Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China"],"raw_orcid":"https://orcid.org/0009-0006-4733-8975","affiliations":[{"raw_affiliation_string":"Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China","institution_ids":["https://openalex.org/I4210144143"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Songtao Pang","orcid":"https://orcid.org/0009-0004-4192-5568"},"institutions":[{"id":"https://openalex.org/I4210144143","display_name":"Inspur (China)","ror":"https://ror.org/0474p4r72","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210144143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songtao Pang","raw_affiliation_strings":["Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China"],"raw_orcid":"https://orcid.org/0009-0004-4192-5568","affiliations":[{"raw_affiliation_string":"Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China","institution_ids":["https://openalex.org/I4210144143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114184244","display_name":"Guangyong Shang","orcid":"https://orcid.org/0009-0000-8571-3605"},"institutions":[{"id":"https://openalex.org/I4210144143","display_name":"Inspur (China)","ror":"https://ror.org/0474p4r72","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210144143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangyong Shang","raw_affiliation_strings":["Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China"],"raw_orcid":"https://orcid.org/0009-0000-8571-3605","affiliations":[{"raw_affiliation_string":"Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China","institution_ids":["https://openalex.org/I4210144143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100625585","display_name":"Qiang Ma","orcid":"https://orcid.org/0000-0001-5791-1890"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiang Ma","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5791-1890","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061103594","display_name":"Zheng Yang","orcid":"https://orcid.org/0000-0003-4048-2684"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Yang","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China","School of Software, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-4048-2684","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]},{"raw_affiliation_string":"School of Software, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.26245693,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"7444","last_page":"7452"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9747999906539917,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.6552000045776367},{"id":"https://openalex.org/keywords/semantic-mapping","display_name":"Semantic mapping","score":0.6074000000953674},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.48910000920295715},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.4860999882221222},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4781000018119812},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.46000000834465027},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4537000060081482},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.438400000333786},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.43149998784065247}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8169000148773193},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.6552000045776367},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.6074000000953674},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5029000043869019},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.48910000920295715},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.4860999882221222},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4781000018119812},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4634999930858612},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.46000000834465027},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4537000060081482},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.438400000333786},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.43149998784065247},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.4117000102996826},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.4088999927043915},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.39980000257492065},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.39410001039505005},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.384799987077713},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.35899999737739563},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.299699991941452},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.2935999929904938},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.28940001130104065},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.27630001306533813},{"id":"https://openalex.org/C37926939","wikidata":"https://www.wikidata.org/wiki/Q7449061","display_name":"Semantic equivalence","level":4,"score":0.27489998936653137},{"id":"https://openalex.org/C202708506","wikidata":"https://www.wikidata.org/wiki/Q7449050","display_name":"Semantic compression","level":5,"score":0.2676999866962433},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.2667999863624573}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3746027.3754887","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754887","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2508.01723","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2508.01723","pdf_url":"https://arxiv.org/pdf/2508.01723","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3746027.3754887","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754887","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":3,"referenced_works":["https://openalex.org/W3047014196","https://openalex.org/W4392172801","https://openalex.org/W4402354022"],"related_works":[],"abstract_inverted_index":{"Grounding":[0,109],"natural":[1],"language":[2,40,159],"instructions":[3],"to":[4,47,94],"visual":[5],"observations":[6],"is":[7],"fundamental":[8],"for":[9,66,163],"embodied":[10,164],"agents":[11],"operating":[12],"in":[13,18,37,49,70,147,156],"open-world":[14],"environments.":[15],"Recent":[16],"advances":[17],"visual-language":[19,63],"mapping":[20,134],"have":[21],"enabled":[22],"generalizable":[23],"semantic":[24,52,75,133],"representations":[25],"by":[26,116],"leveraging":[27],"visionlanguage":[28],"models":[29],"(VLMs).":[30],"However,":[31],"these":[32],"methods":[33],"often":[34],"fall":[35],"short":[36],"aligning":[38],"free-form":[39,158],"commands":[41],"with":[42],"specific":[43],"scene":[44],"instances,":[45],"due":[46],"limitations":[48],"both":[50,132],"instance-level":[51],"consistency":[53],"and":[54,91,120,129,135,160],"instruction":[55,68,102],"interpretation.":[56],"We":[57,124],"present":[58],"OpenMap,":[59],"a":[60,81],"zero-shot":[61,148],"open-vocabulary":[62],"map":[64],"designed":[65],"accurate":[67],"grounding":[69],"navigation":[71],"tasks.":[72,138],"To":[73,100],"address":[74],"inconsistencies":[76],"across":[77],"views,":[78],"we":[79,104],"introduce":[80],"Structural-Semantic":[82],"Consensus":[83],"constraint":[84],"that":[85,111,142],"jointly":[86],"considers":[87],"global":[88],"geometric":[89],"structure":[90],"vision-language":[92],"similarity":[93],"guide":[95],"robust":[96],"3D":[97,161],"instancelevel":[98],"aggregation.":[99],"improve":[101],"interpretation,":[103],"propose":[105],"an":[106],"LLM-assisted":[107],"Instruction-to-Instance":[108],"module":[110],"enables":[112],"fine-grained":[113],"instance":[114],"selection":[115],"incorporating":[117],"spatial":[118],"context":[119],"expressive":[121],"target":[122],"descriptions.":[123],"evaluate":[125],"OpenMap":[126,143],"on":[127],"ScanNet200":[128],"Matterport3D,":[130],"covering":[131],"instruction-to-target":[136],"retrieval":[137],"Experimental":[139],"results":[140],"show":[141],"outperforms":[144],"state-of-the-art":[145],"baselines":[146],"settings,":[149],"demonstrating":[150],"the":[151],"effectiveness":[152],"of":[153],"our":[154],"method":[155],"bridging":[157],"perception":[162],"navigation.":[165]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-25T00:00:00"}
