{"id":"https://openalex.org/W7160264719","doi":"https://doi.org/10.48550/arxiv.2605.02130","title":"From Where Things Are to What They Are For: Benchmarking Spatial-Functional Intelligence in Multimodal LLMs","display_name":"From Where Things Are to What They Are For: Benchmarking Spatial-Functional Intelligence in Multimodal LLMs","publication_year":2026,"publication_date":"2026-05-04","ids":{"openalex":"https://openalex.org/W7160264719","doi":"https://doi.org/10.48550/arxiv.2605.02130"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.02130","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02130","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.02130","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135323802","display_name":"Le Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Le","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135401769","display_name":"Jihan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jihan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079829875","display_name":"Soundarya Krishnan","orcid":"https://orcid.org/0000-0003-0486-6285"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krishnan, Soundarya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135358058","display_name":"Jimit Majmudar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Majmudar, Jimit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002684022","display_name":"Xiou Ge","orcid":"https://orcid.org/0000-0002-8263-2073"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Xiou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135296815","display_name":"Prasoon Puri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Puri, Prasoon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135309885","display_name":"Prathamesh Nandkishor Saraf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saraf, Prathamesh Nandkishor","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135400740","display_name":"Shruti Bhargava","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhargava, Shruti","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077148644","display_name":"Dhivya Piraviperumal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Piraviperumal, Dhivya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034610099","display_name":"Yinan Ling","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Yinan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135300381","display_name":"Cindy Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Cindy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135415886","display_name":"Hong Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Hong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063960231","display_name":"Aishwarya Agrawal","orcid":"https://orcid.org/0000-0002-8620-8077"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agrawal, Aishwarya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5039215196","display_name":"Bo-Hsiang Tseng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tseng, Bo-Hsiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8664000034332275,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8664000034332275,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.019200000911951065,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.01769999973475933,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.6880000233650208},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.642300009727478},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.620199978351593},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5763999819755554},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.48100000619888306},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.3926999866962433},{"id":"https://openalex.org/keywords/grounded-theory","display_name":"Grounded theory","score":0.3781999945640564},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.374099999666214}],"concepts":[{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.6880000233650208},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.642300009727478},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6255000233650208},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.620199978351593},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5763999819755554},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48500001430511475},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.48100000619888306},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.46549999713897705},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3926999866962433},{"id":"https://openalex.org/C156325361","wikidata":"https://www.wikidata.org/wiki/Q1152864","display_name":"Grounded theory","level":3,"score":0.3781999945640564},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.374099999666214},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.36640000343322754},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3614000082015991},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.36010000109672546},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.34439998865127563},{"id":"https://openalex.org/C105409693","wikidata":"https://www.wikidata.org/wiki/Q5937824","display_name":"Human intelligence","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31709998846054077},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C2777371692","wikidata":"https://www.wikidata.org/wiki/Q2178611","display_name":"Spatial cognition","level":3,"score":0.3086000084877014},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.26570001244544983},{"id":"https://openalex.org/C201638289","wikidata":"https://www.wikidata.org/wiki/Q457396","display_name":"Ambient intelligence","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.02130","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02130","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.02130","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02130","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Human-level":[0],"agentic":[1],"intelligence":[2],"extends":[3],"beyond":[4],"low-level":[5],"geometric":[6,26],"perception,":[7,131],"evolving":[8],"from":[9,68],"recognizing":[10],"where":[11],"things":[12],"are":[13,18],"to":[14,129,143],"understanding":[15,89],"what":[16],"they":[17,35],"for.":[19],"While":[20],"existing":[21],"benchmarks":[22],"effectively":[23],"evaluate":[24],"the":[25,40,54],"perception":[27],"capabilities":[28],"of":[29,38,80],"multimodal":[30,177],"large":[31],"language":[32],"models":[33,128],"(MLLMs),":[34],"fall":[36],"short":[37],"probing":[39],"higher-order":[41],"cognitive":[42],"abilities":[43],"required":[44],"for":[45,167],"grounded":[46,159,176],"intelligence.":[47,160],"To":[48],"address":[49],"this":[50],"gap,":[51],"we":[52],"introduce":[53],"Spatial-Functional":[55],"Intelligence":[56],"Benchmark":[57],"(SFI-Bench),":[58],"a":[59,154,164],"video-based":[60],"benchmark":[61,111],"with":[62,147],"over":[63],"1,500":[64],"expert-annotated":[65],"questions":[66],"derived":[67],"diverse":[69],"egocentric":[70],"indoor":[71],"video":[72],"scans.":[73],"SFI-Bench":[74,161],"systematically":[75],"evaluates":[76],"two":[77],"complementary":[78],"dimensions":[79],"advanced":[81],"reasoning:":[82],"(1)":[83],"Structured":[84],"Spatial":[85],"Reasoning,":[86,100],"which":[87,101],"requires":[88],"complex":[90],"layouts":[91],"and":[92,97,106,123,133,150,174],"forming":[93],"coherent":[94],"spatial":[95,145],"representations,":[96],"(2)":[98],"Functional":[99],"involves":[102],"inferring":[103],"object":[104],"affordances":[105],"their":[107],"context-dependent":[108],"utility.":[109],"The":[110],"includes":[112],"tasks":[113],"such":[114],"as":[115],"conditional":[116],"counting,":[117],"multi-hop":[118],"relational":[119],"reasoning,":[120],"functional":[121,148],"pairing,":[122],"knowledge-grounded":[124],"troubleshooting,":[125],"directly":[126],"challenging":[127],"integrate":[130],"memory,":[132],"inference.":[134],"Our":[135],"experiments":[136],"reveal":[137],"that":[138],"current":[139],"MLLMs":[140],"consistently":[141],"struggle":[142],"combine":[144],"memory":[146],"reasoning":[149],"external":[151],"knowledge,":[152],"highlighting":[153],"critical":[155],"bottleneck":[156],"in":[157],"achieving":[158],"therefore":[162],"provides":[163],"diagnostic":[165],"tool":[166],"measuring":[168],"progress":[169],"toward":[170],"more":[171],"cognitively":[172],"capable":[173],"truly":[175],"agents.":[178]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
