{"id":"https://openalex.org/W7134833888","doi":"https://doi.org/10.48550/arxiv.2603.07751","title":"3ViewSense: Spatial and Mental Perspective Reasoning from Orthographic Views in Vision-Language Models","display_name":"3ViewSense: Spatial and Mental Perspective Reasoning from Orthographic Views in Vision-Language Models","publication_year":2026,"publication_date":"2026-03-08","ids":{"openalex":"https://openalex.org/W7134833888","doi":"https://doi.org/10.48550/arxiv.2603.07751"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.07751","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106163705","display_name":"Shaoxiong Zhan","orcid":"https://orcid.org/0009-0000-1150-2856"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhan, Shaoxiong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123056534","display_name":"Yanlin Lai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lai, Yanlin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128631476","display_name":"Zheng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128648717","display_name":"Hai Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Hai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128684923","display_name":"Shen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128688022","display_name":"Xiaodong Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Xiaodong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071102832","display_name":"Zijian Lin","orcid":"https://orcid.org/0009-0006-8701-6815"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Zijian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128654293","display_name":"Wen Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128662169","display_name":"Hai-Tao Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Hai-Tao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5106163705"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8503999710083008,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8503999710083008,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.04529999941587448,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.022700000554323196,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6912999749183655},{"id":"https://openalex.org/keywords/orthographic-projection","display_name":"Orthographic projection","score":0.6654999852180481},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5464000105857849},{"id":"https://openalex.org/keywords/mental-rotation","display_name":"Mental rotation","score":0.5293999910354614},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5249000191688538},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5234000086784363},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4887999892234802},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.44429999589920044},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4325000047683716}],"concepts":[{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6912999749183655},{"id":"https://openalex.org/C175694140","wikidata":"https://www.wikidata.org/wiki/Q980329","display_name":"Orthographic projection","level":2,"score":0.6654999852180481},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6184999942779541},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5792999863624573},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5464000105857849},{"id":"https://openalex.org/C68867621","wikidata":"https://www.wikidata.org/wiki/Q257360","display_name":"Mental rotation","level":3,"score":0.5293999910354614},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5249000191688538},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5234000086784363},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4887999892234802},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.44429999589920044},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4325000047683716},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4189999997615814},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.3935000002384186},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3700000047683716},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3686999976634979},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.36570000648498535},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.3084999918937683},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C96199812","wikidata":"https://www.wikidata.org/wiki/Q2145290","display_name":"Mental representation","level":3,"score":0.3003000020980835},{"id":"https://openalex.org/C517642484","wikidata":"https://www.wikidata.org/wiki/Q2388514","display_name":"Intelligence analysis","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.2922999858856201},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.28859999775886536},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C95358315","wikidata":"https://www.wikidata.org/wiki/Q2686231","display_name":"Mental mapping","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26080000400543213},{"id":"https://openalex.org/C2982912361","wikidata":"https://www.wikidata.org/wiki/Q1851867","display_name":"Mental model","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C2780342009","wikidata":"https://www.wikidata.org/wiki/Q18387795","display_name":"Perspective-taking","level":3,"score":0.2578999996185303},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2533000111579895}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.07751","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.07751","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.07751","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.07751","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"Large":[1],"Language":[2],"Models":[3,10],"have":[4],"achieved":[5],"Olympiad-level":[6],"logic,":[7],"yet":[8],"Vision-Language":[9],"paradoxically":[11],"falter":[12],"on":[13,81,119,134],"elementary":[14],"spatial":[15,55,75,120,139,150,158],"tasks":[16],"like":[17],"block":[18],"counting.":[19],"This":[20],"capability":[21],"mismatch":[22],"reveals":[23],"a":[24,52,71,86,153],"critical":[25],"``spatial":[26],"intelligence":[27,159],"gap,''":[28],"where":[29],"models":[30],"fail":[31],"to":[32,97],"construct":[33],"coherent":[34],"3D":[35],"mental":[36,113],"representations":[37],"from":[38],"2D":[39],"observations.":[40],"We":[41],"uncover":[42],"this":[43],"gap":[44],"via":[45],"diagnostic":[46],"analyses":[47],"showing":[48],"the":[49,145],"bottleneck":[50],"is":[51],"missing":[53],"view-consistent":[54,138],"interface":[56],"rather":[57],"than":[58],"insufficient":[59],"visual":[60],"features":[61],"or":[62],"weak":[63],"reasoning.":[64,140],"To":[65],"bridge":[66],"this,":[67],"we":[68,84],"introduce":[69],"\\textbf{3ViewSense},":[70],"framework":[72,142],"that":[73,89,124],"grounds":[74],"reasoning":[76,121],"in":[77,160],"Orthographic":[78],"Views.":[79],"Drawing":[80],"engineering":[82],"cognition,":[83],"propose":[85],"``Simulate-and-Reason''":[87],"mechanism":[88],"decomposes":[90],"complex":[91],"scenes":[92],"into":[93],"canonical":[94],"orthographic":[95],"projections":[96],"resolve":[98],"geometric":[99],"ambiguities.":[100],"By":[101],"aligning":[102],"egocentric":[103],"perceptions":[104],"with":[105,131],"these":[106],"allocentric":[107],"references,":[108],"our":[109,125],"method":[110,126],"facilitates":[111],"explicit":[112],"rotation":[114],"and":[115,137,147],"reconstruction.":[116],"Empirical":[117],"results":[118],"benchmarks":[122],"demonstrate":[123],"significantly":[127],"outperforms":[128],"existing":[129],"baselines,":[130],"consistent":[132],"gains":[133],"occlusion-heavy":[135],"counting":[136],"The":[141],"also":[143],"improves":[144],"stability":[146],"consistency":[148],"of":[149],"descriptions,":[151],"offering":[152],"scalable":[154],"path":[155],"toward":[156],"stronger":[157],"multimodal":[161],"systems.":[162]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-11T00:00:00"}
