{"id":"https://openalex.org/W4415537391","doi":"https://doi.org/10.1145/3746027.3754918","title":"A Neural Representation Framework with LLM-Driven Spatial Reasoning for Open-Vocabulary 3D Visual Grounding","display_name":"A Neural Representation Framework with LLM-Driven Spatial Reasoning for Open-Vocabulary 3D Visual Grounding","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537391","doi":"https://doi.org/10.1145/3746027.3754918"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754918","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754918","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhenyang Liu","orcid":"https://orcid.org/0000-0003-1323-7632"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhenyang Liu","raw_affiliation_strings":["Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-1323-7632","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090851249","display_name":"Sixiao Zheng","orcid":"https://orcid.org/0000-0001-8324-1528"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sixiao Zheng","raw_affiliation_strings":["Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-8324-1528","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Siyu Chen","orcid":"https://orcid.org/0009-0005-3467-0197"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Siyu Chen","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-3467-0197","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064237960","display_name":"Cairong Zhao","orcid":"https://orcid.org/0000-0001-6745-9674"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cairong Zhao","raw_affiliation_strings":["Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-6745-9674","affiliations":[{"raw_affiliation_string":"Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063691574","display_name":"Longfei Liang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210154949","display_name":"New Hope Liuhe (China)","ror":"https://ror.org/04h9a4v60","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210154949"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longfei Liang","raw_affiliation_strings":["NeuhHelium Co., Ltd., Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-5199-5134","affiliations":[{"raw_affiliation_string":"NeuhHelium Co., Ltd., Shanghai, China","institution_ids":["https://openalex.org/I4210154949"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003418019","display_name":"Xiangyang Xue","orcid":"https://orcid.org/0000-0002-4897-9209"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangyang Xue","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-4897-9209","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084959430","display_name":"Yanwei Fu","orcid":"https://orcid.org/0000-0002-6595-6893"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanwei Fu","raw_affiliation_strings":["Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-6595-6893","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":1.1332,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.83507459,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1042","last_page":"1051"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6751000285148621},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.6624000072479248},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5900999903678894},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.47870001196861267},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.47690001130104065},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4708999991416931},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4311999976634979},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4203999936580658}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7736999988555908},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6751000285148621},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.6624000072479248},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.661300003528595},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5900999903678894},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.47870001196861267},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.47690001130104065},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4708999991416931},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4311999976634979},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4203999936580658},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.41609999537467957},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.38670000433921814},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.3864000141620636},{"id":"https://openalex.org/C2776035091","wikidata":"https://www.wikidata.org/wiki/Q7928819","display_name":"Viewpoints","level":2,"score":0.3797999918460846},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.37529999017715454},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37130001187324524},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3158999979496002},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.3125},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.3077000081539154},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2879999876022339}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754918","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754918","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2478649345","display_name":null,"funder_award_id":"24511103100","funder_id":"https://openalex.org/F4320321885","funder_display_name":"Science and Technology Commission of Shanghai Municipality"}],"funders":[{"id":"https://openalex.org/F4320321885","display_name":"Science and Technology Commission of Shanghai Municipality","ror":"https://ror.org/03kt66j61"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2277195237","https://openalex.org/W2489434015","https://openalex.org/W2912067802","https://openalex.org/W2963109634","https://openalex.org/W3095974555","https://openalex.org/W3107521863","https://openalex.org/W3133833192","https://openalex.org/W3205239453","https://openalex.org/W3216551675","https://openalex.org/W4200150166","https://openalex.org/W4293811398","https://openalex.org/W4312280420","https://openalex.org/W4312420092","https://openalex.org/W4312912313","https://openalex.org/W4384825045","https://openalex.org/W4385318467","https://openalex.org/W4386065742","https://openalex.org/W4386075561","https://openalex.org/W4390872744","https://openalex.org/W4390874575","https://openalex.org/W4402726969"],"related_works":[],"abstract_inverted_index":{"Open-vocabulary":[0],"3D":[1,30,40,93,123,157,209,228,247,269],"visual":[2,116,124,161,270],"grounding":[3,271],"aims":[4],"to":[5,61,137,166,205,225,237],"localize":[6,63,206],"target":[7,50,208],"objects":[8,51],"based":[9,211],"on":[10,75,212],"free-form":[11],"language":[12,31,57,69,90,108,131,175,218],"queries,":[13,70,132],"which":[14],"is":[15,194,222],"crucial":[16],"for":[17,121,145],"embodied":[18],"AI":[19],"applications":[20],"such":[21,71,240],"as":[22,72,233,241],"autonomous":[23],"navigation,":[24],"robotics,":[25],"and":[26,45,92,141,149,164,176,183],"augmented":[27],"reality.":[28],"Learning":[29],"fields":[32],"through":[33],"neural":[34,103,229,263],"representations":[35],"enables":[36],"accurate":[37],"understanding":[38],"of":[39,49],"scenes":[41],"from":[42,82],"limited":[43,224],"viewpoints":[44],"facilitates":[46],"the":[47,76,146,187,198,207,213,217],"localization":[48],"in":[52,68,88,130,156,201,216,268],"complex":[53],"environments.":[54],"However,":[55],"existing":[56],"field":[58,120,173,193],"methods":[59],"struggle":[60],"accurately":[62],"instances":[64],"using":[65,179,197],"spatial":[66,86,111,128,139,150,154,214,275],"relations":[67,87,140],"''the":[73],"book":[74],"chair.''":[77],"This":[78,172],"limitation":[79],"mainly":[80],"arises":[81],"inadequate":[83],"reasoning":[84,112,129,155,276],"about":[85],"both":[89],"queries":[91],"scenes.":[94],"In":[95],"this":[96],"work,":[97],"we":[98],"propose":[99],"SpatialReasoner,":[100],"a":[101,115,168,202,226,234],"novel":[102],"representation-based":[104],"framework":[105,235,256],"with":[106],"large":[107],"model":[109],"(LLM)-driven":[110],"that":[113,254],"constructs":[114],"properties-enhanced":[117],"hierarchical":[118,169,203],"feature":[119,170],"open-vocabulary":[122],"grounding.":[125],"To":[126,152],"enable":[127,153],"SpatialReasoner":[133,159,221],"fine-tunes":[134],"an":[135],"LLM":[136],"capture":[138],"explicitly":[142],"infer":[143],"instructions":[144,200],"target,":[147],"anchor,":[148],"relation.":[151],"scenes,":[158],"incorporates":[160],"properties":[162],"(opacity":[163],"color)":[165],"construct":[167],"field.":[171],"represents":[174],"instance":[177,210],"features":[178,182],"distilled":[180],"CLIP":[181],"masks":[184],"extracted":[185],"via":[186],"Segment":[188],"Anything":[189],"Model":[190],"(SAM).":[191],"The":[192],"then":[195],"queried":[196],"inferred":[199],"manner":[204],"relation":[215],"query.":[219],"Notably,":[220],"not":[223],"specific":[227],"representation;":[230],"it":[231],"serves":[232],"adaptable":[236],"various":[238],"representations,":[239,264],"Neural":[242],"Radiance":[243],"Fields":[244],"(NeRF)":[245],"or":[246],"Gaussian":[248],"Splatting":[249],"(3DGS).":[250],"Extensive":[251],"experiments":[252],"show":[253],"our":[255],"can":[257],"be":[258],"seamlessly":[259],"integrated":[260],"into":[261],"different":[262],"outperforming":[265],"baseline":[266],"models":[267],"while":[272],"empowering":[273],"their":[274],"capability.":[277],"Project":[278],"Homepage:ZhenyangLiu.github.io/SpatialReasoner.":[279]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-25T00:00:00"}
