{"id":"https://openalex.org/W7162466198","doi":"https://doi.org/10.48550/arxiv.2605.25334","title":"Dual-Pathway Geometry-Aware MLLM for Spatial Intelligence","display_name":"Dual-Pathway Geometry-Aware MLLM for Spatial Intelligence","publication_year":2026,"publication_date":"2026-05-25","ids":{"openalex":"https://openalex.org/W7162466198","doi":"https://doi.org/10.48550/arxiv.2605.25334"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.25334","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25334","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.25334","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137053661","display_name":"Yufei Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Yufei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137041895","display_name":"Xuhan Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xuhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051354149","display_name":"Zide Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zide","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091098647","display_name":"Chunpeng Zhou","orcid":"https://orcid.org/0000-0002-7675-8403"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Chunpeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136992024","display_name":"Chenfeng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chenfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137078791","display_name":"Yongchao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yongchao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137020547","display_name":"Yunnan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yunnan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137057620","display_name":"Jiawei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiawei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137062525","display_name":"Pengfei Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Pengfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137082522","display_name":"Wei Zhai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhai, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137035885","display_name":"Yang Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136997552","display_name":"Zheng-Jun Zha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zha, Zheng-Jun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8787000179290771,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8787000179290771,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11904","display_name":"Spatial Cognition and Navigation","score":0.03460000082850456,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.01679999940097332,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5127999782562256},{"id":"https://openalex.org/keywords/point-cloud","display_name":"Point cloud","score":0.49079999327659607},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.482699990272522},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4514999985694885},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4172999858856201},{"id":"https://openalex.org/keywords/landmark","display_name":"Landmark","score":0.4020000100135803},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.38100001215934753},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.38089999556541443},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.366100013256073},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.3492000102996826}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7181000113487244},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6848000288009644},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5127999782562256},{"id":"https://openalex.org/C131979681","wikidata":"https://www.wikidata.org/wiki/Q1899648","display_name":"Point cloud","level":2,"score":0.49079999327659607},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.482699990272522},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4514999985694885},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4172999858856201},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4032000005245209},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.4020000100135803},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.38100001215934753},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.38089999556541443},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.366100013256073},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3492000102996826},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3479999899864197},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3203999996185303},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.31310001015663147},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3124000132083893},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.31130000948905945},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.30959999561309814},{"id":"https://openalex.org/C102634674","wikidata":"https://www.wikidata.org/wiki/Q868473","display_name":"Smoothness","level":2,"score":0.30059999227523804},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.29840001463890076},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C2781020372","wikidata":"https://www.wikidata.org/wiki/Q533093","display_name":"On the fly","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.26489999890327454},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2556999921798706},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.2554999887943268},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.25334","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25334","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.25334","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25334","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.7522494196891785,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Spatial":[0,71],"understanding":[1],"of":[2,15,59,84,103],"the":[3,56,117,128,145],"physical":[4],"world":[5],"from":[6,116,131,192],"2D":[7],"visual":[8,119,151,189],"inputs":[9],"hinges":[10],"on":[11,136,205],"two":[12,101,129],"complementary":[13],"forms":[14,83],"geometric":[16,85],"knowledge:":[17],"holistic":[18],"3D":[19],"structural":[20,114],"perception":[21],"and":[22,54,112,153,187],"fine-grained":[23],"metric":[24,110],"scale":[25],"estimation.":[26],"Existing":[27],"multimodal":[28],"large":[29],"language":[30],"models":[31],"(MLLMs)":[32],"typically":[33],"address":[34],"only":[35,75],"one":[36],"facet,":[37],"ingesting":[38],"either":[39],"depth":[40],"maps":[41],"or":[42],"point":[43],"clouds":[44],"as":[45,78,163,168],"additional":[46],"model":[47,169],"inputs,":[48],"which":[49,99,160],"incurs":[50],"substantial":[51],"computational":[52],"overhead":[53],"inherits":[55],"generalization":[57],"limitations":[58],"upstream":[60],"prediction":[61],"models.":[62],"We":[63,171],"propose":[64],"GAMSI,":[65],"a":[66,88,122,174,198],"dual-pathway":[67],"Geometry-Aware":[68],"MLLM":[69],"for":[70],"Intelligence":[72],"that":[73],"takes":[74],"RGB":[76],"images":[77],"input":[79],"while":[80],"internalizing":[81],"both":[82],"prior":[86],"within":[87],"unified":[89],"autoregressive":[90],"backbone.":[91],"Specifically,":[92],"we":[93],"introduce":[94],"Metric-Structure":[95],"Decoupled":[96],"Queries":[97],"(MSDQ)":[98],"employ":[100],"groups":[102],"learnable":[104],"queries":[105],"to":[106,149],"respectively":[107],"extract":[108],"dense":[109],"signals":[111],"sparse":[113],"cues":[115,147],"shared":[118],"context,":[120],"with":[121,156,197],"task-decoupled":[123],"attention":[124],"mask":[125],"further":[126,172],"preventing":[127],"pathways":[130],"contaminating":[132],"each":[133],"other.":[134],"Building":[135],"this,":[137],"an":[138],"Expert-Guided":[139],"Visual":[140],"Grounding":[141],"(EVG)":[142],"module":[143],"projects":[144],"aggregated":[146],"back":[148],"frame-level":[150],"features":[152],"aligns":[154],"them":[155],"vision":[157],"foundation":[158],"models,":[159],"serve":[161],"purely":[162],"training-time":[164],"supervision,":[165],"rather":[166],"than":[167],"inputs.":[170],"build":[173],"multi-task":[175],"spatial":[176,207],"instruction-tuning":[177],"dataset":[178],"(MTS)":[179],"comprising":[180],"152{,}776":[181],"samples":[182],"spanning":[183],"13":[184],"task":[185],"types":[186],"three":[188],"modalities,":[190],"consolidated":[191],"six":[193],"public":[194],"datasets.":[195],"Trained":[196],"two-stage":[199],"curriculum,":[200],"GAMSI":[201],"achieves":[202],"state-of-the-art":[203],"performance":[204],"seven":[206],"intelligence":[208],"benchmarks.":[209]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-27T00:00:00"}
