{"id":"https://openalex.org/W4415709389","doi":"https://doi.org/10.1109/icme59968.2025.11208990","title":"Spatial 3D-LLM : Exploring Spatial Awareness in 3D Vision-Language Models","display_name":"Spatial 3D-LLM : Exploring Spatial Awareness in 3D Vision-Language Models","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415709389","doi":"https://doi.org/10.1109/icme59968.2025.11208990"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11208990","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11208990","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047533435","display_name":"Xiaoyan Wang","orcid":"https://orcid.org/0000-0003-3477-0102"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xiaoyan Wang","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074707604","display_name":"Zeju Li","orcid":"https://orcid.org/0000-0002-4608-2959"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeju Li","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103166791","display_name":"Yifan Xu","orcid":"https://orcid.org/0000-0003-2467-888X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yifan Xu","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102025297","display_name":"Jing Qi","orcid":"https://orcid.org/0000-0002-5688-5579"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiaxing Qi","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101489620","display_name":"Zhifei Yang","orcid":"https://orcid.org/0009-0004-9314-8051"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhifei Yang","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067770870","display_name":"Ruifei Ma","orcid":"https://orcid.org/0000-0002-2546-3867"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruifei Ma","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062045482","display_name":"Xiangde Liu","orcid":"https://orcid.org/0000-0003-3087-0472"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiangde Liu","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100460182","display_name":"Chao Zhang","orcid":"https://orcid.org/0000-0002-5583-0066"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chao Zhang","raw_affiliation_strings":["Beijing Digital Native Digital City Research Center"],"affiliations":[{"raw_affiliation_string":"Beijing Digital Native Digital City Research Center","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5047533435"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.428,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.86960617,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9168999791145325,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9168999791145325,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.017500000074505806,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.005100000184029341,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.7423999905586243},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.6877999901771545},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5767999887466431},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5691999793052673},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5573999881744385},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4986000061035156},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.49399998784065247},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.46889999508857727},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4564000070095062}],"concepts":[{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.7423999905586243},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7343999743461609},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.6877999901771545},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5767999887466431},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5691999793052673},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5573999881744385},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5346999764442444},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4986000061035156},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.49399998784065247},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.46889999508857727},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4564000070095062},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4546999931335449},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.42250001430511475},{"id":"https://openalex.org/C2777897806","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3D modeling","level":2,"score":0.4221999943256378},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.3871000111103058},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.3540000021457672},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34769999980926514},{"id":"https://openalex.org/C153938966","wikidata":"https://www.wikidata.org/wiki/Q3348148","display_name":"Object-based spatial database","level":4,"score":0.34389999508857727},{"id":"https://openalex.org/C2986522900","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relationship","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C158709400","wikidata":"https://www.wikidata.org/wiki/Q3578586","display_name":"Spatial ecology","level":2,"score":0.33469998836517334},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C2778662690","wikidata":"https://www.wikidata.org/wiki/Q3125339","display_name":"Spatial ability","level":3,"score":0.2897000014781952},{"id":"https://openalex.org/C203689450","wikidata":"https://www.wikidata.org/wiki/Q2302053","display_name":"Spatial database","level":3,"score":0.2890999913215637},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C42629822","wikidata":"https://www.wikidata.org/wiki/Q1346408","display_name":"Geocoding","level":2,"score":0.2702000141143799}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11208990","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11208990","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2594519801","https://openalex.org/W2988715931","https://openalex.org/W3107521863","https://openalex.org/W3182910454","https://openalex.org/W4226376247","https://openalex.org/W4390872495","https://openalex.org/W4390874439","https://openalex.org/W4394773594","https://openalex.org/W4401416853","https://openalex.org/W4401417048","https://openalex.org/W4401991200","https://openalex.org/W4402716288","https://openalex.org/W4402716423","https://openalex.org/W4403808973"],"related_works":[],"abstract_inverted_index":{"New":[0],"era":[1],"has":[2],"unlocked":[3],"exciting":[4],"possibilities":[5],"for":[6,72],"extending":[7],"Large":[8],"Language":[9],"Models":[10],"(LLMs)":[11],"to":[12,35,45,68,111,137],"tackle":[13],"3D":[14,20,28,53,64,73,82,108,122,127,133,158],"vision-language":[15,74,159],"tasks.":[16],"However,":[17],"most":[18],"existing":[19],"multimodal":[21],"LLMs":[22],"(MLLMs)":[23],"rely":[24],"on":[25],"compressing":[26],"holistic":[27],"scene":[29,109],"information":[30,100],"or":[31],"segmenting":[32],"independent":[33],"objects":[34],"perform":[36],"these":[37,57],"tasks,":[38,160],"which":[39],"limits":[40],"their":[41],"spatial":[42,70,79,93,99,141,168,175],"awareness":[43,71,94,142,169],"due":[44],"insufficient":[46],"representation":[47],"of":[48,81,157,171],"the":[49,78,102,139,162],"richness":[50],"inherent":[51],"in":[52],"scenes.":[54,83],"To":[55],"overcome":[56],"limitations,":[58],"we":[59,117],"propose":[60],"Spatial":[61,84,148],"3D-LLM,":[62],"a":[63,91,132,154],"MLLM":[65],"specifically":[66],"designed":[67],"enhance":[69],"tasks":[75],"by":[76],"enriching":[77],"embeddings":[80,110],"3D-LLM":[85,149],"integrates":[86],"an":[87],"LLM":[88],"backbone":[89],"with":[90],"progressive":[92,167],"scheme":[95,170],"that":[96,147],"progressively":[97],"captures":[98],"as":[101,113],"perception":[103],"field":[104],"expands,":[105],"generating":[106],"location-enriched":[107],"serve":[112],"visual":[114],"prompts.":[115],"Furthermore,":[116],"introduce":[118],"two":[119],"novel":[120],"tasks:":[121],"object":[123],"distance":[124],"measurement":[125],"and":[126,130],"layout":[128],"editing,":[129],"construct":[131],"instruction":[134],"dataset,":[135],"MODEL,":[136],"evaluate":[138],"model\u2019s":[140],"capabilities.":[143],"Experimental":[144],"results":[145],"demonstrate":[146],"achieves":[150],"state-of-the-art":[151],"performance":[152],"across":[153],"wide":[155],"range":[156],"revealing":[161],"improvements":[163],"stemmed":[164],"from":[165],"our":[166],"mining":[172],"more":[173],"profound":[174],"information.":[176],"Our":[177],"code":[178],"is":[179],"available":[180],"at":[181],"https://github.com/bjshuyuan/Spatial-3D-LLM.":[182]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-30T00:00:00"}
