{"id":"https://openalex.org/W7139107915","doi":"https://doi.org/10.48550/arxiv.2603.16461","title":"GAP-MLLM: Geometry-Aligned Pre-training for Activating 3D Spatial Perception in Multimodal Large Language Models","display_name":"GAP-MLLM: Geometry-Aligned Pre-training for Activating 3D Spatial Perception in Multimodal Large Language Models","publication_year":2026,"publication_date":"2026-03-17","ids":{"openalex":"https://openalex.org/W7139107915","doi":"https://doi.org/10.48550/arxiv.2603.16461"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.16461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.16461","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130066318","display_name":"Jiaxin Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Jiaxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129881712","display_name":"Junjun Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Junjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129764358","display_name":"Haijie Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129835966","display_name":"Youyu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Youyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129800729","display_name":"Kui Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Kui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5026731366","display_name":"Dave Zhenyu Chen","orcid":"https://orcid.org/0000-0002-3883-1905"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Dave Zhenyu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5130066318"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9316999912261963,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9316999912261963,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.013000000268220901,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.006000000052154064,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/concatenation","display_name":"Concatenation (mathematics)","score":0.5723999738693237},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5680000185966492},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5514000058174133},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.5027999877929688},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.45350000262260437},{"id":"https://openalex.org/keywords/solid-modeling","display_name":"Solid modeling","score":0.3968999981880188},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.38190001249313354},{"id":"https://openalex.org/keywords/cognitive-neuroscience-of-visual-object-recognition","display_name":"Cognitive neuroscience of visual object recognition","score":0.3693999946117401},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.3635999858379364},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.362199991941452}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7343000173568726},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6337000131607056},{"id":"https://openalex.org/C87619178","wikidata":"https://www.wikidata.org/wiki/Q126002","display_name":"Concatenation (mathematics)","level":2,"score":0.5723999738693237},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5680000185966492},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5514000058174133},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.5027999877929688},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.45350000262260437},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.44510000944137573},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.3968999981880188},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.38190001249313354},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.3693999946117401},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.362199991941452},{"id":"https://openalex.org/C104065381","wikidata":"https://www.wikidata.org/wiki/Q1002535","display_name":"Geometric modeling","level":2,"score":0.35580000281333923},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.33649998903274536},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3303999900817871},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3070000112056732},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.2897000014781952},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.2865999937057495},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2858999967575073},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.28130000829696655},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C152565575","wikidata":"https://www.wikidata.org/wiki/Q1124538","display_name":"Conditional random field","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.26489999890327454},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.26460000872612},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.260699987411499},{"id":"https://openalex.org/C109950114","wikidata":"https://www.wikidata.org/wiki/Q4464732","display_name":"3D reconstruction","level":2,"score":0.25459998846054077},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2538999915122986},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.16461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.16461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"demonstrate":[5,162],"exceptional":[6],"semantic":[7,131,158],"reasoning":[8],"but":[9,57],"struggle":[10],"with":[11,145],"3D":[12,27,43,175,178,182],"spatial":[13],"perception":[14,111],"when":[15],"restricted":[16],"to":[17,39,68,78,92,126],"pure":[18],"RGB":[19],"inputs.":[20],"Despite":[21],"leveraging":[22],"implicit":[23],"geometric":[24,55,70,135,154,167],"priors":[25,155],"from":[26,53,58],"reconstruction":[28],"models,":[29],"image-based":[30],"methods":[31,40],"still":[32],"exhibit":[33],"a":[34,59,103,118,140,146],"notable":[35],"performance":[36,173],"gap":[37,49],"compared":[38],"using":[41],"explicit":[42],"data.":[44],"We":[45],"argue":[46],"that":[47,107,122,163],"this":[48,98],"does":[50],"not":[51],"arise":[52],"insufficient":[54],"priors,":[56],"misalignment":[60],"in":[61],"the":[62,124],"training":[63],"paradigm:":[64],"text-dominated":[65],"fine-tuning":[66],"fails":[67],"activate":[69],"representations":[71],"within":[72],"MLLMs.":[73],"Existing":[74],"approaches":[75],"typically":[76],"resort":[77],"naive":[79],"feature":[80,168],"concatenation":[81],"and":[82,170,181],"optimize":[83],"directly":[84],"for":[85],"downstream":[86,113],"tasks":[87],"without":[88,156],"geometry-specific":[89],"supervision,":[90],"leading":[91],"suboptimal":[93],"structural":[94,110],"utilization.":[95],"To":[96],"address":[97],"limitation,":[99],"we":[100,116,138],"propose":[101],"GAP-MLLM,":[102],"Geometry-Aligned":[104],"Pre-training":[105],"paradigm":[106],"explicitly":[108],"activates":[109],"before":[112],"adaptation.":[114],"Specifically,":[115],"introduce":[117],"visual-prompted":[119],"joint":[120],"task":[121],"compels":[123],"MLLMs":[125],"predict":[127],"sparse":[128],"pointmaps":[129],"alongside":[130],"labels,":[132],"thereby":[133],"enforcing":[134],"awareness.":[136],"Furthermore,":[137],"design":[139],"multi-level":[141],"progressive":[142],"fusion":[143,169],"module":[144],"token-level":[147],"gating":[148],"mechanism,":[149],"enabling":[150],"adaptive":[151],"integration":[152],"of":[153],"suppressing":[157],"reasoning.":[159],"Extensive":[160],"experiments":[161],"GAP-MLLM":[164],"significantly":[165],"enhances":[166,172],"consistently":[171],"across":[174],"visual":[176],"grounding,":[177],"dense":[179],"captioning,":[180],"video":[183],"object":[184],"detection":[185],"tasks.":[186]},"counts_by_year":[],"updated_date":"2026-03-20T20:54:20.808490","created_date":"2026-03-20T00:00:00"}
