{"id":"https://openalex.org/W7138022109","doi":"https://doi.org/10.1609/aaai.v40i15.38258","title":"What You See Is What You Reach: Towards Spatial Navigation with High-Level Human Instructions","display_name":"What You See Is What You Reach: Towards Spatial Navigation with High-Level Human Instructions","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138022109","doi":"https://doi.org/10.1609/aaai.v40i15.38258"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i15.38258","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i15.38258","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38258/42220","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38258/42220","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129672682","display_name":"Lingfeng Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lingfeng Zhang","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School, Tsinghua University\nPeng Cheng Laboratory\nXiaomi EV"],"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School, Tsinghua University\nPeng Cheng Laboratory\nXiaomi EV","institution_ids":["https://openalex.org/I4210114105","https://openalex.org/I3131625388"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022646224","display_name":"Haoxiang Fu","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Haoxiang Fu","raw_affiliation_strings":["National University of Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129644286","display_name":"Xiaoshuai Hao","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoshuai Hao","raw_affiliation_strings":["Xiaomi EV"],"affiliations":[{"raw_affiliation_string":"Xiaomi EV","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123329394","display_name":"Shuyi Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210157642","display_name":"Institute of Automation","ror":"https://ror.org/056qj1t15","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210157642","https://openalex.org/I78650965"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Shuyi Zhang","raw_affiliation_strings":["Institute of Automation, CAS"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, CAS","institution_ids":["https://openalex.org/I4210157642"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129662465","display_name":"Qiang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]},{"id":"https://openalex.org/I929597975","display_name":"National University of Sciences and Technology","ror":"https://ror.org/03w2j5y17","country_code":"PK","type":"education","lineage":["https://openalex.org/I929597975"]}],"countries":["HK","PK"],"is_corresponding":false,"raw_author_name":"Qiang Zhang","raw_affiliation_strings":["HKUSTGZ"],"affiliations":[{"raw_affiliation_string":"HKUSTGZ","institution_ids":["https://openalex.org/I929597975","https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129704254","display_name":"Rui Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Liu","raw_affiliation_strings":["Inner Mongolia University"],"affiliations":[{"raw_affiliation_string":"Inner Mongolia University","institution_ids":["https://openalex.org/I2722730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129713475","display_name":"Long Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Chen","raw_affiliation_strings":["Xiaomi EV"],"affiliations":[{"raw_affiliation_string":"Xiaomi EV","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129649585","display_name":"Wenbo Ding","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenbo Ding","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School, Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School, Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":5,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5129672682"],"corresponding_institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I4210114105"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30820896,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"15","first_page":"12627","last_page":"12635"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.843500018119812,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.843500018119812,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.03629999980330467,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.01510000042617321,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5220000147819519},{"id":"https://openalex.org/keywords/mobile-robot-navigation","display_name":"Mobile robot navigation","score":0.5203999876976013},{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.515999972820282},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.5094000101089478},{"id":"https://openalex.org/keywords/navigation-system","display_name":"Navigation system","score":0.5059000253677368},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.49320000410079956},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4510999917984009},{"id":"https://openalex.org/keywords/turn-by-turn-navigation","display_name":"Turn-by-turn navigation","score":0.4510999917984009}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7329000234603882},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.565500020980835},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5220000147819519},{"id":"https://openalex.org/C26990112","wikidata":"https://www.wikidata.org/wiki/Q6887224","display_name":"Mobile robot navigation","level":5,"score":0.5203999876976013},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5170999765396118},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.515999972820282},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.5094000101089478},{"id":"https://openalex.org/C2777891301","wikidata":"https://www.wikidata.org/wiki/Q3475123","display_name":"Navigation system","level":2,"score":0.5059000253677368},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.49320000410079956},{"id":"https://openalex.org/C43472768","wikidata":"https://www.wikidata.org/wiki/Q7855620","display_name":"Turn-by-turn navigation","level":5,"score":0.4510999917984009},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4510999917984009},{"id":"https://openalex.org/C43729271","wikidata":"https://www.wikidata.org/wiki/Q3560550","display_name":"Spatial memory","level":4,"score":0.4278999865055084},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4162999987602234},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3610999882221222},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.35409998893737793},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.3521000146865845},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.3294999897480011},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C2986522900","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relationship","level":2,"score":0.25780001282691956}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i15.38258","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i15.38258","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38258/42220","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i15.38258","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i15.38258","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38258/42220","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138022109.pdf","grobid_xml":"https://content.openalex.org/works/W7138022109.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Embodied":[0],"navigation":[1,29,58,69,77,115,133,152,168,175,202,237],"is":[2],"a":[3,22,53,131,155,173,204],"fundamental":[4],"capability":[5],"that":[6,229],"enables":[7],"embodied":[8,28,57],"agents":[9,87],"to":[10,39,81,88,103,158,183,211],"effectively":[11,214],"interact":[12],"with":[13,44],"the":[14,140,166,196,208,220,246],"physical":[15],"world":[16],"in":[17,121,215,235],"various":[18],"complex":[19,107,216],"environments.":[20,108],"However,":[21],"significant":[23],"gap":[24,221],"remains":[25],"between":[26,222],"current":[27],"tasks":[30,238],"and":[31,74,95,150,162,188,206,224,242],"real-world":[32,122,243],"requirements,":[33],"as":[34],"existing":[35],"methods":[36],"often":[37],"struggle":[38],"integrate":[40],"high-level":[41,145,185],"human":[42,146,186],"instructions":[43,187],"spatial":[45,60,67,75,93,132,167,236],"understanding.":[46],"To":[47,124],"address":[48],"this":[49,126],"gap,":[50],"we":[51,128,170],"propose":[52],"new":[54],"task":[55],"of":[56,136,248],"called":[59],"navigation,":[61],"which":[62],"encompasses":[63],"two":[64],"key":[65],"components:":[66],"object":[68],"(SpON)":[70],"for":[71,79],"object-specific":[72],"guidance":[73],"area":[76],"(SpAN)":[78],"navigating":[80,102],"designated":[82],"areas.":[83],"Specifically,":[84,177],"SpON":[85],"guides":[86],"specific":[89],"objects":[90,192],"by":[91,218],"leveraging":[92],"relationships":[94],"contextual":[96],"understanding,":[97],"while":[98],"SpAN":[99],"focuses":[100],"on":[101,165],"defined":[104],"areas":[105,194],"within":[106,139,195],"Together,":[109],"these":[110],"components":[111],"significantly":[112],"enhance":[113,159],"agents\u2019":[114],"capabilities,":[116],"enabling":[117],"more":[118],"effective":[119],"interactions":[120],"scenarios.":[123],"support":[125],"task,":[127],"have":[129],"generated":[130],"dataset":[134,143],"consisting":[135],"10K":[137],"trajectories":[138],"simulator.":[141],"This":[142],"includes":[144],"instructions,":[147],"detailed":[148],"observations,":[149],"corresponding":[151],"actions,":[153],"providing":[154],"comprehensive":[156],"resource":[157],"agent":[160],"training":[161],"performance.":[163],"Building":[164],"dataset,":[169],"introduce":[171],"SpNav,":[172],"hierarchical":[174],"framework.":[176],"SpNav":[178,230],"employs":[179],"vision-language":[180],"model":[181],"(VLM)":[182],"interpret":[184],"accurately":[189],"identify":[190],"goal":[191],"or":[193],"observation":[197],"range,":[198],"achieving":[199],"precise":[200],"point-to-point":[201],"using":[203],"map":[205],"enhancing":[207],"agent\u2019s":[209],"ability":[210],"oper-":[212],"ate":[213],"environments":[217],"bridging":[219],"perception":[223],"action.":[225],"Extensive":[226],"experiments":[227],"show":[228],"achieves":[231],"state-of-the-art":[232],"(SOTA)":[233],"performance":[234],"across":[239],"both":[240],"simulated":[241],"environments,":[244],"validating":[245],"effectiveness":[247],"our":[249],"method.":[250]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
