{"id":"https://openalex.org/W7137914612","doi":"https://doi.org/10.48550/arxiv.2603.15386","title":"RieMind: Geometry-Grounded Spatial Agent for Scene Understanding","display_name":"RieMind: Geometry-Grounded Spatial Agent for Scene Understanding","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7137914612","doi":"https://doi.org/10.48550/arxiv.2603.15386"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15386","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15386","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15386","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129720319","display_name":"Fernando Ropero","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ropero, Fernando","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129752957","display_name":"Erkin Turkoz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Turkoz, Erkin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124786040","display_name":"Daniel Matos","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matos, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110543003","display_name":"Junqing Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Junqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072611940","display_name":"Antonio Ruiz\u2013Cort\u00e9s","orcid":"https://orcid.org/0000-0001-9827-1834"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruiz, Antonio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124787326","display_name":"Yanfeng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yanfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129670283","display_name":"Lu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Lu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008082360","display_name":"Mingwei Sun","orcid":"https://orcid.org/0000-0003-4613-244X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Mingwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5042432324","display_name":"Yejie Wang","orcid":"https://orcid.org/0009-0000-7361-1679"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yongliang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5129720319"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.010499999858438969,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.002300000051036477,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.689300000667572},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.6467000246047974},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6392999887466431},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5077999830245972},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4927000105381012},{"id":"https://openalex.org/keywords/commonsense-reasoning","display_name":"Commonsense reasoning","score":0.4603999853134155},{"id":"https://openalex.org/keywords/decoupling","display_name":"Decoupling (probability)","score":0.44339999556541443},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.43650001287460327},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.43059998750686646}],"concepts":[{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.689300000667572},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.6467000246047974},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6392999887466431},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6351000070571899},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5856999754905701},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5077999830245972},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4927000105381012},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.4603999853134155},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.44339999556541443},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.43650001287460327},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.43059998750686646},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.41510000824928284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3944000005722046},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.388700008392334},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.3781999945640564},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.3702999949455261},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3578999936580658},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.3433000147342682},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.32170000672340393},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.320499986410141},{"id":"https://openalex.org/C2777371692","wikidata":"https://www.wikidata.org/wiki/Q2178611","display_name":"Spatial cognition","level":3,"score":0.3156999945640564},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.30070000886917114},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.27489998936653137},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2671000063419342},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.26269999146461487}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15386","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15386","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15386","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15386","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Visual":[0],"Language":[1],"Models":[2],"(VLMs)":[3],"have":[4],"increasingly":[5],"become":[6],"the":[7,103,112,137,151],"main":[8],"paradigm":[9],"for":[10,61],"understanding":[11,29],"indoor":[12,64],"scenes,":[13],"but":[14],"they":[15],"still":[16],"struggle":[17],"with":[18,111,185],"metric":[19],"and":[20,39,49,129,155,204],"spatial":[21,32,54,130,152,201],"reasoning.":[22,40,55,217],"Current":[23],"approaches":[24],"rely":[25],"on":[26,136,150],"end-to-end":[27,215],"video":[28],"or":[30],"large-scale":[31],"question":[33],"answering":[34],"fine-tuning,":[35],"inherently":[36],"coupling":[37],"perception":[38,48,95],"In":[41],"this":[42],"paper,":[43],"we":[44,101,134,156],"investigate":[45],"whether":[46],"decoupling":[47],"reasoning":[50,66,99,153,202],"leads":[51],"to":[52,168,175,190,213],"improved":[53],"We":[56],"propose":[57],"an":[58,69,72,143],"agentic":[59,179],"framework":[60],"static":[62,138],"3D":[63,74],"scene":[65,75,84,113],"that":[67,119,158,195,206],"grounds":[68],"LLM":[70],"in":[71],"explicit":[73,196],"graph":[76],"(3DSG).":[77],"Rather":[78],"than":[79,163],"ingesting":[80],"videos":[81],"directly,":[82],"each":[83],"is":[85,160],"represented":[86],"as":[87,124],"a":[88,93,210],"persistent":[89],"3DSG":[90,104],"constructed":[91],"by":[92,166],"dedicated":[94],"module.":[96],"To":[97],"isolate":[98],"performance,":[100,154,184,203],"instantiate":[102],"from":[105],"ground-truth":[106],"annotations.":[107],"The":[108,132],"agent":[109],"interacts":[110],"exclusively":[114],"through":[115],"structured":[116,207],"geometric":[117,197],"tools":[118],"expose":[120],"fundamental":[121],"properties":[122],"such":[123],"object":[125],"dimensions,":[126],"distances,":[127],"poses,":[128],"relationships.":[131],"results":[133],"obtain":[135],"split":[139],"of":[140],"VSI-Bench":[141],"provide":[142],"upper":[144],"bound":[145],"under":[146],"ideal":[147],"perceptual":[148],"conditions":[149],"find":[157],"it":[159],"significantly":[161,182],"higher":[162],"previous":[164],"works,":[165],"up":[167],"16\\%,":[169],"without":[170],"task":[171],"specific":[172],"fine-tuning.":[173],"Compared":[174],"base":[176],"VLMs,":[177],"our":[178],"variant":[180],"achieves":[181],"better":[183],"average":[186],"improvements":[187],"between":[188],"33\\%":[189],"50\\%.":[191],"These":[192],"findings":[193],"indicate":[194],"grounding":[198],"substantially":[199],"improves":[200],"suggest":[205],"representations":[208],"offer":[209],"compelling":[211],"alternative":[212],"purely":[214],"visual":[216]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-18T00:00:00"}
