{"id":"https://openalex.org/W4415536948","doi":"https://doi.org/10.1145/3746027.3754985","title":"SeqVLM: Proposal-Guided Multi-View Sequences Reasoning via VLM for Zero-Shot 3D Visual Grounding","display_name":"SeqVLM: Proposal-Guided Multi-View Sequences Reasoning via VLM for Zero-Shot 3D Visual Grounding","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415536948","doi":"https://doi.org/10.1145/3746027.3754985"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754985","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754985","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006737019","display_name":"Jiawen Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiawen Lin","raw_affiliation_strings":["School of Informatics, Xiamen University, Xiamen, Fujian, China"],"affiliations":[{"raw_affiliation_string":"School of Informatics, Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108909467","display_name":"Shiran Bian","orcid":"https://orcid.org/0000-0003-3000-7679"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiran Bian","raw_affiliation_strings":["School of Informatics, Xiamen University, Xiamen, Fujian, China"],"affiliations":[{"raw_affiliation_string":"School of Informatics, Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003616880","display_name":"Yihang Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yihang Zhu","raw_affiliation_strings":["School of Computer Science, Nanjing University, Nanjing, Jiangsu, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Nanjing University, Nanjing, Jiangsu, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101141679","display_name":"Wenbin Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I75867142","display_name":"Xiamen University of Technology","ror":"https://ror.org/01285e189","country_code":"CN","type":"education","lineage":["https://openalex.org/I75867142"]},{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenbin Tan","raw_affiliation_strings":["School of Informatics, Xiamen University, XiaMen, Fujian, China"],"affiliations":[{"raw_affiliation_string":"School of Informatics, Xiamen University, XiaMen, Fujian, China","institution_ids":["https://openalex.org/I191208505","https://openalex.org/I75867142"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100667178","display_name":"Yachao Zhang","orcid":"https://orcid.org/0000-0002-6153-5004"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yachao Zhang","raw_affiliation_strings":["School of Informatics, Xiamen University, Xiamen, Fujian, China"],"affiliations":[{"raw_affiliation_string":"School of Informatics, Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110042813","display_name":"Yuan Xie","orcid":"https://orcid.org/0000-0001-6945-7437"},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuan Xie","raw_affiliation_strings":["School of Computer Science and Technology, East China Normal University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, East China Normal University, Shanghai, China","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076485255","display_name":"Yanyun Qu","orcid":"https://orcid.org/0000-0002-8926-4162"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanyun Qu","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, Fujian, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5006737019"],"corresponding_institution_ids":["https://openalex.org/I191208505"],"apc_list":null,"apc_paid":null,"fwci":1.428,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.8677017,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3094","last_page":"3103"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.6323999762535095},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5527999997138977},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5088000297546387},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.45750001072883606},{"id":"https://openalex.org/keywords/point-cloud","display_name":"Point cloud","score":0.4458000063896179},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.435699999332428},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4334999918937683},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.4230000078678131},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.413100004196167},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4083000123500824}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7947999835014343},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6947000026702881},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.6323999762535095},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5527999997138977},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5088000297546387},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5013999938964844},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.45750001072883606},{"id":"https://openalex.org/C131979681","wikidata":"https://www.wikidata.org/wiki/Q1899648","display_name":"Point cloud","level":2,"score":0.4458000063896179},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.435699999332428},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4334999918937683},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.4230000078678131},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.413100004196167},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4083000123500824},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.40529999136924744},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4018999934196472},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.3856000006198883},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.3741999864578247},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.3662000000476837},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.33719998598098755},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.32910001277923584},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.32190001010894775},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.3172000050544739},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31690001487731934},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3147999942302704},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.31380000710487366},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3127000033855438},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.30169999599456787},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2635999917984009},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2635999917984009},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.2556999921798706},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754985","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754985","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W3034949383","https://openalex.org/W3095974555","https://openalex.org/W3107521863","https://openalex.org/W3133833192","https://openalex.org/W3175234951","https://openalex.org/W4214684415","https://openalex.org/W4312274934","https://openalex.org/W4312749817","https://openalex.org/W4312852845","https://openalex.org/W4313162371","https://openalex.org/W4383109105","https://openalex.org/W4386075583","https://openalex.org/W4386075660","https://openalex.org/W4390872020","https://openalex.org/W4390872744","https://openalex.org/W4393065402","https://openalex.org/W4394596988","https://openalex.org/W4399120308","https://openalex.org/W4403577400"],"related_works":[],"abstract_inverted_index":{"3D":[0,9,87,92,132],"Visual":[1],"Grounding":[2],"(3DVG)":[3],"aims":[4],"to":[5,47,135,138,159],"localize":[6],"objects":[7],"in":[8,21,127],"scenes":[10],"using":[11],"natural":[12],"language":[13],"descriptions.":[14],"Although":[15],"supervised":[16],"methods":[17,40,185],"achieve":[18],"higher":[19],"accuracy":[20],"constrained":[22],"settings,":[23],"zero-shot":[24,39,67,184],"3DVG":[25,68,193],"holds":[26],"greater":[27,195],"promise":[28],"for":[29,79],"real-world":[30,73,198],"applications":[31],"since":[32],"eliminating":[33],"scene-specific":[34],"training":[35],"requirements.":[36],"However,":[37],"existing":[38],"face":[41],"challenges":[42],"of":[43,131,178],"spatial-limited":[44],"reasoning":[45,157],"due":[46],"reliance":[48],"on":[49,165],"single-view":[50],"localization,":[51],"and":[52,96,124,168,180,188,197],"contextual":[53,125],"omissions":[54],"or":[55],"detail":[56],"degradation.":[57],"To":[58],"address":[59],"these":[60,113],"issues,":[61],"we":[62,143],"propose":[63],"SeqVLM,":[64],"a":[65,91,145],"novel":[66],"framework":[69],"that":[70,149],"leverages":[71],"multi-view":[72,108],"scene":[74,118],"images":[75],"with":[76],"spatial":[77,122],"information":[78],"target":[80],"object":[81],"reasoning.":[82],"Specifically,":[83],"SeqVLM":[84],"first":[85],"generates":[86],"instance":[88],"proposals":[89,115],"via":[90],"semantic":[93,100],"segmentation":[94],"network":[95],"refines":[97],"them":[98],"through":[99],"filtering,":[101],"retaining":[102],"only":[103],"semantic-relevant":[104],"candidates.":[105],"A":[106],"proposal-guided":[107],"projection":[109],"strategy":[110],"then":[111],"projects":[112],"candidate":[114],"onto":[116],"real":[117],"image":[119],"sequences,":[120],"preserving":[121],"relationships":[123],"details":[126],"the":[128,166],"conversion":[129],"process":[130],"point":[133],"cloud":[134],"images.":[136],"Furthermore,":[137],"mitigate":[139],"VLM":[140],"computational":[141],"overload,":[142],"implement":[144],"dynamic":[146],"scheduling":[147],"mechanism":[148],"iteratively":[150],"processes":[151],"sequances-query":[152],"prompts,":[153],"leveraging":[154],"VLM's":[155],"cross-modal":[156],"capabilities":[158],"identify":[160],"textually":[161],"specified":[162],"objects.":[163],"Experiments":[164],"ScanRefer":[167],"Nr3D":[169],"benchmarks":[170],"demonstrate":[171],"state-of-the-art":[172],"performance,":[173],"achieving":[174],"[email":[175],"protected]":[176],"scores":[177],"55.6%":[179],"53.2%,":[181],"surpassing":[182],"previous":[183],"by":[186],"4.0%":[187],"5.2%,":[189],"respectively,":[190],"which":[191],"advance":[192],"toward":[194],"generalization":[196],"applicability.":[199]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
