{"id":"https://openalex.org/W7131122177","doi":"https://doi.org/10.48550/arxiv.2602.18424","title":"CapNav: Benchmarking Vision Language Models on Capability-conditioned Indoor Navigation","display_name":"CapNav: Benchmarking Vision Language Models on Capability-conditioned Indoor Navigation","publication_year":2026,"publication_date":"2026-02-20","ids":{"openalex":"https://openalex.org/W7131122177","doi":"https://doi.org/10.48550/arxiv.2602.18424"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.18424","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126600711","display_name":"Xia Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Su, Xia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126653309","display_name":"Ruiqi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ruiqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067119790","display_name":"Benlin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Benlin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126647739","display_name":"Jingwei Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Jingwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014510735","display_name":"Zonglin Di","orcid":"https://orcid.org/0000-0002-3875-2676"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Di, Zonglin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126585590","display_name":"Ranjay Krishna","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krishna, Ranjay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5016828530","display_name":"Jon E. Froehlich","orcid":"https://orcid.org/0000-0001-8291-3353"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Froehlich, Jon","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5126600711"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9544000029563904,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9544000029563904,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.008200000040233135,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/traverse","display_name":"Traverse","score":0.8252999782562256},{"id":"https://openalex.org/keywords/obstacle","display_name":"Obstacle","score":0.6908000111579895},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5990999937057495},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.5647000074386597},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5023000240325928},{"id":"https://openalex.org/keywords/mobile-robot-navigation","display_name":"Mobile robot navigation","score":0.49799999594688416},{"id":"https://openalex.org/keywords/navigation-system","display_name":"Navigation system","score":0.3797000050544739},{"id":"https://openalex.org/keywords/turn-by-turn-navigation","display_name":"Turn-by-turn navigation","score":0.3483000099658966}],"concepts":[{"id":"https://openalex.org/C176809094","wikidata":"https://www.wikidata.org/wiki/Q15401496","display_name":"Traverse","level":2,"score":0.8252999782562256},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.6908000111579895},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6693000197410583},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5990999937057495},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.5647000074386597},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5023000240325928},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5009999871253967},{"id":"https://openalex.org/C26990112","wikidata":"https://www.wikidata.org/wiki/Q6887224","display_name":"Mobile robot navigation","level":5,"score":0.49799999594688416},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4578999876976013},{"id":"https://openalex.org/C2777891301","wikidata":"https://www.wikidata.org/wiki/Q3475123","display_name":"Navigation system","level":2,"score":0.3797000050544739},{"id":"https://openalex.org/C43472768","wikidata":"https://www.wikidata.org/wiki/Q7855620","display_name":"Turn-by-turn navigation","level":5,"score":0.3483000099658966},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3474000096321106},{"id":"https://openalex.org/C6683253","wikidata":"https://www.wikidata.org/wiki/Q7075535","display_name":"Obstacle avoidance","level":4,"score":0.34299999475479126},{"id":"https://openalex.org/C44154836","wikidata":"https://www.wikidata.org/wiki/Q45045","display_name":"Simulation","level":1,"score":0.3172000050544739},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.31619998812675476},{"id":"https://openalex.org/C19966478","wikidata":"https://www.wikidata.org/wiki/Q4810574","display_name":"Mobile robot","level":3,"score":0.299699991941452},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.2955999970436096},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2948000133037567},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.2858000099658966},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.2526000142097473},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.18424","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.18424","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.18424","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.18424","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0,8],"Models":[1],"(VLMs)":[2],"have":[3],"shown":[4],"remarkable":[5],"progress":[6],"in":[7,169],"Navigation":[9,52],"(VLN),":[10],"offering":[11],"new":[12],"possibilities":[13],"for":[14,158,164],"navigation":[15,28,101,129,160],"decision-making":[16],"that":[17,126,138,146],"could":[18],"benefit":[19],"both":[20],"robotic":[21],"platforms":[22],"and":[23,72,80,90,103,124,137,161],"human":[24,79],"users.":[25],"However,":[26],"real-world":[27,97],"is":[29,174],"inherently":[30],"conditioned":[31],"by":[32,154],"the":[33,156,162],"agent's":[34,69],"mobility":[35,88,134],"constraints.":[36],"For":[37],"example,":[38],"a":[39,46,54],"sweeping":[40],"robot":[41,81],"cannot":[42],"traverse":[43,112],"stairs,":[44],"while":[45],"quadruped":[47],"can.":[48],"We":[49,119,152],"introduce":[50],"Capability-Conditioned":[51],"(CapNav),":[53],"benchmark":[55,173],"designed":[56],"to":[57,107],"evaluate":[58,120],"how":[59],"well":[60],"VLMs":[61,110,123],"can":[62,111],"navigate":[63],"complex":[64],"indoor":[65,98,113],"spaces":[66],"given":[67],"an":[68],"specific":[70],"physical":[71,86],"operational":[73],"capabilities.":[74,118],"CapNav":[75,94],"defines":[76],"five":[77],"representative":[78],"agents,":[82],"each":[83],"described":[84],"with":[85,143],"dimensions,":[87],"capabilities,":[89],"environmental":[91],"interaction":[92],"abilities.":[93],"provides":[95],"45":[96],"scenes,":[99],"473":[100],"tasks,":[102],"2365":[104],"QA":[105],"pairs":[106],"test":[108],"if":[109],"environments":[114],"based":[115],"on":[116,149],"agent":[117],"13":[121],"modern":[122],"find":[125],"current":[127],"VLM's":[128],"performance":[130],"drops":[131],"sharply":[132],"as":[133],"constraints":[135],"tighten,":[136],"even":[139],"state-of-the-art":[140],"models":[141],"struggle":[142],"obstacle":[144],"types":[145],"require":[147],"reasoning":[148,168],"spatial":[150,167],"dimensions.":[151],"conclude":[153],"discussing":[155],"implications":[157],"capability-aware":[159],"opportunities":[163],"advancing":[165],"embodied":[166],"future":[170],"VLMs.":[171],"The":[172],"available":[175],"at":[176],"https://github.com/makeabilitylab/CapNav":[177]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-24T00:00:00"}
