{"id":"https://openalex.org/W7139966614","doi":"https://doi.org/10.48550/arxiv.2603.18892","title":"MultihopSpatial: Multi-hop Compositional Spatial Reasoning Benchmark for Vision-Language Model","display_name":"MultihopSpatial: Multi-hop Compositional Spatial Reasoning Benchmark for Vision-Language Model","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7139966614","doi":"https://doi.org/10.48550/arxiv.2603.18892"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.18892","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18892","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.18892","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130225803","display_name":"Youngwan Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Youngwan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013982522","display_name":"Soojin Jang","orcid":"https://orcid.org/0000-0002-2719-7646"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jang, Soojin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130250832","display_name":"Yoorhim Cho","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cho, Yoorhim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100410191","display_name":"Seunghwan Lee","orcid":"https://orcid.org/0000-0003-0342-5381"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Seunghwan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101403699","display_name":"Yong-Ju Lee","orcid":"https://orcid.org/0000-0001-9308-233X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Yong-Ju","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130243509","display_name":"Sung Ju Hwang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hwang, Sung Ju","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.7645999789237976},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.7254999876022339},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5986999869346619},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5565000176429749},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5554999709129333},{"id":"https://openalex.org/keywords/qualitative-reasoning","display_name":"Qualitative reasoning","score":0.5184999704360962},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5095000267028809},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.43650001287460327},{"id":"https://openalex.org/keywords/non-monotonic-logic","display_name":"Non-monotonic logic","score":0.3953999876976013}],"concepts":[{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.7645999789237976},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.7254999876022339},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7232000231742859},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.605400025844574},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5986999869346619},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5565000176429749},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5554999709129333},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.5184999704360962},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5095000267028809},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.43650001287460327},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.3953999876976013},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.38019999861717224},{"id":"https://openalex.org/C166088908","wikidata":"https://www.wikidata.org/wiki/Q308495","display_name":"Abductive reasoning","level":2,"score":0.37959998846054077},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.37689998745918274},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.3587000072002411},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.35199999809265137},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34040001034736633},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.3262999951839447},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.29510000348091125},{"id":"https://openalex.org/C2777371692","wikidata":"https://www.wikidata.org/wiki/Q2178611","display_name":"Spatial cognition","level":3,"score":0.29010000824928284},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.18892","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18892","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.18892","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18892","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Spatial":[0],"reasoning":[1,31,79,125,145],"is":[2],"foundational":[3],"for":[4,37,55,96],"Vision-Language":[5],"Models":[6],"(VLMs),":[7],"particularly":[8],"when":[9],"deployed":[10],"as":[11],"Vision-Language-Action":[12],"(VLA)":[13],"agents":[14],"in":[15],"physical":[16],"environments.":[17],"However,":[18],"existing":[19],"benchmarks":[20],"predominantly":[21],"focus":[22],"on":[23,137],"elementary,":[24],"single-hop":[25],"relations,":[26],"neglecting":[27],"the":[28],"multi-hop":[29,56],"compositional":[30,58,123],"and":[32,57,80,88,146],"precise":[33,89],"visual":[34,81],"grounding":[35,82],"essential":[36],"real-world":[38],"scenarios.":[39],"To":[40],"address":[41],"this,":[42],"we":[43,131],"introduce":[44],"MultihopSpatial,":[45],"offering":[46],"three":[47],"key":[48,119],"contributions:":[49],"(1)":[50],"A":[51],"comprehensive":[52],"benchmark":[53],"designed":[54],"spatial":[59,69,109,124,144],"reasoning,":[60],"featuring":[61],"1-":[62],"to":[63,107],"3-hop":[64],"complex":[65],"queries":[66],"across":[67],"diverse":[68],"perspectives.":[70],"(2)":[71],"Acc@50IoU,":[72],"a":[73,102,127],"complementary":[74],"metric":[75],"that":[76,122,133],"simultaneously":[77],"evaluates":[78],"by":[83],"requiring":[84],"both":[85,141],"answer":[86],"selection":[87],"bounding":[90],"box":[91],"prediction":[92],"-":[93],"capabilities":[94],"vital":[95],"robust":[97],"VLA":[98],"deployment.":[99],"(3)":[100],"MultihopSpatial-Train,":[101],"dedicated":[103],"large-scale":[104],"training":[105],"corpus":[106,139],"foster":[108],"intelligence.":[110],"Extensive":[111],"evaluation":[112],"of":[113],"37":[114],"state-of-the-art":[115],"VLMs":[116],"yields":[117],"eight":[118],"insights,":[120],"revealing":[121],"remains":[126],"formidable":[128],"challenge.":[129],"Finally,":[130],"demonstrate":[132],"reinforcement":[134],"learning":[135],"post-training":[136],"our":[138],"enhances":[140],"intrinsic":[142],"VLM":[143],"downstream":[147],"embodied":[148],"manipulation":[149],"performance.":[150]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-21T00:00:00"}
