{"id":"https://openalex.org/W7162284620","doi":"https://doi.org/10.48550/arxiv.2605.23898","title":"SPACENUM: Revisiting Spatial Numerical Understanding in VLMs","display_name":"SPACENUM: Revisiting Spatial Numerical Understanding in VLMs","publication_year":2026,"publication_date":"2026-05-22","ids":{"openalex":"https://openalex.org/W7162284620","doi":"https://doi.org/10.48550/arxiv.2605.23898"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.23898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.23898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.23898","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136961190","display_name":"Jianshu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jianshu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136967333","display_name":"Yijiang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yijiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025218141","display_name":"Huifeixin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Huifeixin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136913893","display_name":"Haoran Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Haoran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123548031","display_name":"Letian Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Letian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136931670","display_name":"Bingyang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bingyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136923055","display_name":"Han Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.43529999256134033,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.43529999256134033,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.22939999401569366,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.16869999468326569,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.5527999997138977},{"id":"https://openalex.org/keywords/trace","display_name":"TRACE (psycholinguistics)","score":0.46209999918937683},{"id":"https://openalex.org/keywords/current","display_name":"Current (fluid)","score":0.4381999969482422},{"id":"https://openalex.org/keywords/spatial-ecology","display_name":"Spatial ecology","score":0.43389999866485596},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.426800012588501},{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.4156000018119812},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.4138999879360199},{"id":"https://openalex.org/keywords/numerical-analysis","display_name":"Numerical analysis","score":0.36079999804496765}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5631999969482422},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.5527999997138977},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.46209999918937683},{"id":"https://openalex.org/C148043351","wikidata":"https://www.wikidata.org/wiki/Q4456944","display_name":"Current (fluid)","level":2,"score":0.4381999969482422},{"id":"https://openalex.org/C158709400","wikidata":"https://www.wikidata.org/wiki/Q3578586","display_name":"Spatial ecology","level":2,"score":0.43389999866485596},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.426800012588501},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.4156000018119812},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.4138999879360199},{"id":"https://openalex.org/C48753275","wikidata":"https://www.wikidata.org/wiki/Q11216","display_name":"Numerical analysis","level":2,"score":0.36079999804496765},{"id":"https://openalex.org/C2780876879","wikidata":"https://www.wikidata.org/wiki/Q3054749","display_name":"Meaning (existential)","level":2,"score":0.3513999879360199},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.34130001068115234},{"id":"https://openalex.org/C500300565","wikidata":"https://www.wikidata.org/wiki/Q925667","display_name":"Computer simulation","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.323199987411499},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.32190001010894775},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29739999771118164},{"id":"https://openalex.org/C150060386","wikidata":"https://www.wikidata.org/wiki/Q7574054","display_name":"Spatial correlation","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C138695830","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial dependence","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.23898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.23898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.23898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.23898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"are":[3,34],"increasingly":[4],"deployed":[5],"in":[6,37,41,71,106,124],"embodied":[7],"environments,":[8],"where":[9],"they":[10],"need":[11],"produce":[12],"numerical":[13,32,47,94,104,186],"outputs":[14,33],"such":[15],"as":[16,60,68],"action":[17],"magnitudes":[18],"and":[19,66,80,92,112,127,140,160,188],"spatial":[20,38,46,64,72,90,107,125,152,165,185,192],"coordinates.":[21],"Although":[22],"these":[23,31],"numbers":[24,59,67,123],"appear":[25],"meaningful,":[26],"it":[27],"remains":[28],"unclear":[29],"whether":[30,99],"genuinely":[35],"grounded":[36],"perception.":[39],"Therefore,":[40],"this":[42],"work,":[43],"we":[44,115,143],"revisit":[45],"understanding":[48,187],"through":[49],"SpaceNum,":[50],"a":[51],"unified":[52],"framework":[53],"that":[54,117,145,173],"captures":[55],"two":[56,76],"complementary":[57],"settings:":[58],"dynamic":[61,110],"transitions":[62,111],"during":[63],"exploration,":[65],"static":[69,113],"layouts":[70,166],"reasoning.":[73],"We":[74,96,170],"formulate":[75],"bidirectional":[77],"tasks,":[78],"Num2Space":[79],"Space2Num,":[81],"to":[82,121,131,155,162,190],"evaluate":[83],"how":[84],"well":[85],"VLMs":[86,101,147],"map":[87],"between":[88],"vision-side":[89],"structure":[91],"language-side":[93],"representations.":[95],"systematically":[97],"study":[98],"current":[100,146],"truly":[102],"understand":[103],"values":[105],"settings.":[108],"Across":[109],"layouts,":[114],"find":[116],"models":[118],"largely":[119],"fail":[120,161],"ground":[122],"meaning":[126],"often":[128],"perform":[129],"close":[130],"random":[132],"guess.":[133],"Through":[134],"error":[135],"analysis,":[136,139],"reasoning":[137,175,193],"trace":[138],"controlled":[141],"interventions,":[142],"show":[144,172],"rely":[148],"heavily":[149],"on":[150],"shallow":[151],"cues,":[153],"struggle":[154],"build":[156],"stable":[157],"coordinate-aware":[158],"representations,":[159],"abstract":[163],"structured":[164],"from":[167],"visual":[168],"observations.":[169],"further":[171],"explicit":[174],"provides":[176],"only":[177],"marginal":[178],"gains,":[179],"while":[180],"tuning":[181],"can":[182],"partially":[183],"improve":[184],"transfer":[189],"external":[191],"benchmarks.":[194]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-26T00:00:00"}
