{"id":"https://openalex.org/W4416749631","doi":"https://doi.org/10.1109/iros60139.2025.11246684","title":"WMNav: Integrating Vision-Language Models into World Models for Object Goal Navigation","display_name":"WMNav: Integrating Vision-Language Models into World Models for Object Goal Navigation","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416749631","doi":"https://doi.org/10.1109/iros60139.2025.11246684"},"language":null,"primary_location":{"id":"doi:10.1109/iros60139.2025.11246684","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246684","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113273861","display_name":"Dujun Nie","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dujun Nie","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022357888","display_name":"Xianda Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianda Guo","raw_affiliation_strings":["Wuhan University,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"Wuhan University,School of Computer Science","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015429091","display_name":"Yiqun Duan","orcid":null},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yiqun Duan","raw_affiliation_strings":["University of Technology Sydney,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"University of Technology Sydney,School of Computer Science","institution_ids":["https://openalex.org/I114017466"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100660644","display_name":"Ruijun Zhang","orcid":"https://orcid.org/0000-0001-6869-8861"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruijun Zhang","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102813447","display_name":"Long Chen","orcid":"https://orcid.org/0009-0004-6764-3403"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Chen","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation","institution_ids":["https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5113273861"],"corresponding_institution_ids":["https://openalex.org/I19820366"],"apc_list":null,"apc_paid":null,"fwci":5.9594,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.96566241,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2392","last_page":"2399"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.868399977684021,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.868399977684021,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.02800000086426735,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.010599999688565731,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6075000166893005},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5501999855041504},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.5461000204086304},{"id":"https://openalex.org/keywords/curiosity","display_name":"Curiosity","score":0.5429999828338623},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.4602000117301941},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.44449999928474426},{"id":"https://openalex.org/keywords/plan","display_name":"Plan (archaeology)","score":0.4185999929904938},{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.38909998536109924}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6916000247001648},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6075000166893005},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5641999840736389},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5501999855041504},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.5461000204086304},{"id":"https://openalex.org/C33435437","wikidata":"https://www.wikidata.org/wiki/Q366791","display_name":"Curiosity","level":2,"score":0.5429999828338623},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4641000032424927},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.4602000117301941},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.44449999928474426},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.4185999929904938},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3650999963283539},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2786000072956085},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2678000032901764},{"id":"https://openalex.org/C2780366209","wikidata":"https://www.wikidata.org/wiki/Q5170200","display_name":"Core model","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25060001015663147},{"id":"https://openalex.org/C2780210234","wikidata":"https://www.wikidata.org/wiki/Q422638","display_name":"Action plan","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros60139.2025.11246684","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246684","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1980035368","https://openalex.org/W2964339842","https://openalex.org/W3038822267","https://openalex.org/W3210395242","https://openalex.org/W3211462570","https://openalex.org/W4312707839","https://openalex.org/W4312929500","https://openalex.org/W4312956471","https://openalex.org/W4385416186","https://openalex.org/W4386083015","https://openalex.org/W4389666115","https://openalex.org/W4390874575","https://openalex.org/W4401042437","https://openalex.org/W4401417061","https://openalex.org/W4401417236","https://openalex.org/W4402754246","https://openalex.org/W4404612908","https://openalex.org/W4404770192"],"related_works":[],"abstract_inverted_index":{"Object":[0],"Goal":[1],"Navigation-requiring":[2],"an":[3,11],"agent":[4],"to":[5,87,90,117,127],"locate":[6],"a":[7,14,41,67,128,161],"specific":[8],"object":[9],"in":[10,17,23,184],"unseen":[12],"environment-remains":[13],"core":[15],"challenge":[16],"embodied":[18],"AI.":[19],"Although":[20],"recent":[21],"progress":[22],"Vision-Language":[24,75],"Model":[25],"(VLM)-based":[26],"agents":[27],"has":[28,38],"demonstrated":[29],"promising":[30],"perception":[31],"and":[32,50,84,153,176,188,195,202],"decision-making":[33],"abilities":[34],"through":[35],"prompting,":[36],"none":[37],"yet":[39],"established":[40],"fully":[42],"modular":[43],"world":[44,114,150],"model":[45,115,138,151],"design":[46],"that":[47],"reduces":[48],"risky":[49],"costly":[51],"interactions":[52],"with":[53],"the":[54,58,62,91,96,100,104,113,135,145,149],"environment":[55],"by":[56,74,140,169],"predicting":[57],"future":[59],"state":[60,98],"of":[61,82,99,112,137],"world.":[63],"We":[64],"introduce":[65],"WMNav,":[66],"novel":[68],"World":[69],"Model-based":[70],"Navigation":[71],"framework":[72],"powered":[73],"Models":[76],"(VLMs).":[77],"It":[78],"predicts":[79],"possible":[80],"outcomes":[81],"decisions":[83,142],"builds":[85],"memories":[86],"provide":[88,118],"feedback":[89,146],"policy":[92],"module.":[93],"To":[94,155],"retain":[95],"predicted":[97],"environment,":[101],"WMNav":[102,132,179],"proposes":[103],"online":[105],"maintained":[106],"Curiosity":[107],"Value":[108],"Map":[109],"as":[110],"part":[111],"memory":[116],"dynamic":[119],"configuration":[120],"for":[121],"navigation":[122],"policy.":[123],"By":[124],"decomposing":[125],"according":[126],"human-like":[129],"thinking":[130],"process,":[131],"effectively":[133],"alleviates":[134],"impact":[136],"hallucination":[139],"making":[141],"based":[143],"on":[144,174,198,205],"difference":[147],"between":[148],"plan":[152],"observation.":[154],"further":[156],"boost":[157],"efficiency,":[158],"we":[159],"implement":[160],"two-stage":[162],"action":[163],"proposer":[164],"strategy:":[165],"broad":[166],"exploration":[167,189],"followed":[168],"precise":[170],"localization.":[171],"Extensive":[172],"evaluation":[173],"HM3D":[175],"MP3D":[177],"validates":[178],"surpasses":[180],"existing":[181],"zero-shot":[182],"benchmarks":[183],"both":[185],"success":[186],"rate":[187],"efficiency":[190],"(absolute":[191],"improvement:":[192],"+3.2%":[193,196],"SR":[194,201],"SPL":[197,204],"HM3D,":[199],"+13.5%":[200],"+1.1%":[203],"MP3D).":[206],"Project":[207],"page:":[208],"https://b0b8k1ng.github.io/WMNav/.":[209]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-11-28T00:00:00"}
