{"id":"https://openalex.org/W4413462458","doi":"https://doi.org/10.1109/tmm.2025.3599032","title":"Weakly-Supervised 3D Visual Grounding Based on Visual Language Alignment","display_name":"Weakly-Supervised 3D Visual Grounding Based on Visual Language Alignment","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4413462458","doi":"https://doi.org/10.1109/tmm.2025.3599032"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3599032","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3599032","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100591441","display_name":"Xiaoxu Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiaoxu Xu","raw_affiliation_strings":["College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011133696","display_name":"Yitian Yuan","orcid":"https://orcid.org/0000-0001-8701-7689"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yitian Yuan","raw_affiliation_strings":["Meituan Inc., Beijing, China","Meituan Inc., China"],"affiliations":[{"raw_affiliation_string":"Meituan Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"Meituan Inc., China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055435177","display_name":"Qiudan Zhang","orcid":"https://orcid.org/0000-0001-6067-8188"},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiudan Zhang","raw_affiliation_strings":["College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081114494","display_name":"Wenhui Wu","orcid":"https://orcid.org/0000-0002-0416-7719"},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenhui Wu","raw_affiliation_strings":["College of Electronics and Information Engineering, Shenzhen University, Shenzhen, China","College of Electronics and Information Engineering, Shenzhen University, China"],"affiliations":[{"raw_affiliation_string":"College of Electronics and Information Engineering, Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]},{"raw_affiliation_string":"College of Electronics and Information Engineering, Shenzhen University, China","institution_ids":["https://openalex.org/I180726961"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075329194","display_name":"Zequn Jie","orcid":"https://orcid.org/0000-0002-3038-5891"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zequn Jie","raw_affiliation_strings":["Meituan Inc., Beijing, China","Meituan Inc., China"],"affiliations":[{"raw_affiliation_string":"Meituan Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"Meituan Inc., China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Lin Ma","orcid":"https://orcid.org/0000-0002-7331-6132"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin Ma","raw_affiliation_strings":["Meituan Inc., Beijing, China","Meituan Inc., China"],"affiliations":[{"raw_affiliation_string":"Meituan Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"Meituan Inc., China","institution_ids":[]}]},{"author_position":"last","author":{"id":null,"display_name":"Xu Wang","orcid":"https://orcid.org/0000-0002-2948-6468"},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu Wang","raw_affiliation_strings":["College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100591441"],"corresponding_institution_ids":["https://openalex.org/I180726961"],"apc_list":null,"apc_paid":null,"fwci":1.2181,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.82751546,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"27","issue":null,"first_page":"7662","last_page":"7674"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9732000231742859,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8511502742767334},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5784732699394226},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5022330284118652},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.46833181381225586},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4669676125049591},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.4178350865840912},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3779618740081787}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8511502742767334},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5784732699394226},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5022330284118652},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.46833181381225586},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4669676125049591},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.4178350865840912},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3779618740081787},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3599032","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3599032","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2239077016","display_name":null,"funder_award_id":"62376162","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5281037491","display_name":null,"funder_award_id":"62371310","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7771545035","display_name":null,"funder_award_id":"2023A1515011236","funder_id":"https://openalex.org/F4320337111","funder_display_name":"Basic and Applied Basic Research Foundation of Guangdong Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320337111","display_name":"Basic and Applied Basic Research Foundation of Guangdong Province","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":46,"referenced_works":["https://openalex.org/W2247513039","https://openalex.org/W2594519801","https://openalex.org/W2798990097","https://openalex.org/W2799263800","https://openalex.org/W2963445828","https://openalex.org/W2963835840","https://openalex.org/W2986670728","https://openalex.org/W2987401211","https://openalex.org/W2989176720","https://openalex.org/W3012255272","https://openalex.org/W3034949383","https://openalex.org/W3035497460","https://openalex.org/W3095974555","https://openalex.org/W3100393531","https://openalex.org/W3107521863","https://openalex.org/W3117585461","https://openalex.org/W3133833192","https://openalex.org/W3140398265","https://openalex.org/W3151130473","https://openalex.org/W3175234951","https://openalex.org/W3178418424","https://openalex.org/W3179041377","https://openalex.org/W3203949114","https://openalex.org/W3206171352","https://openalex.org/W3213106580","https://openalex.org/W4214490042","https://openalex.org/W4214684415","https://openalex.org/W4285414077","https://openalex.org/W4312565984","https://openalex.org/W4312749817","https://openalex.org/W4312852845","https://openalex.org/W4313145013","https://openalex.org/W4313162371","https://openalex.org/W4316661142","https://openalex.org/W4319299680","https://openalex.org/W4385245566","https://openalex.org/W4386065742","https://openalex.org/W4386075583","https://openalex.org/W4387272106","https://openalex.org/W4390872495","https://openalex.org/W4390872744","https://openalex.org/W4390873695","https://openalex.org/W4390874439","https://openalex.org/W4401416853","https://openalex.org/W4402778219","https://openalex.org/W4403888405"],"related_works":["https://openalex.org/W2021787609","https://openalex.org/W2068608913","https://openalex.org/W1537063595","https://openalex.org/W2097328689","https://openalex.org/W4234899305","https://openalex.org/W2379604501","https://openalex.org/W3124914020","https://openalex.org/W2373854414","https://openalex.org/W2574906695","https://openalex.org/W2522183581"],"abstract_inverted_index":{"Learning":[0],"to":[1,6,44,147,167],"ground":[2,143],"natural":[3],"language":[4],"queries":[5,146],"target":[7,150],"objects":[8,151],"or":[9],"regions":[10],"in":[11,128,172],"3D":[12,19,24,107,118,149,169],"point":[13,108,119],"clouds":[14,120],"is":[15,40,163],"quite":[16],"essential":[17],"for":[18,36,58,124],"scene":[20],"understanding.":[21],"Nevertheless,":[22],"existing":[23,101],"visual":[25,62,170],"grounding":[26,63,171],"approaches":[27],"require":[28],"a":[29,54,173],"substantial":[30],"number":[31],"of":[32,81,159],"bounding":[33],"box":[34,126],"annotations":[35,127],"text":[37,145],"queries,":[38],"which":[39],"time-consuming":[41],"and":[42,93,106,110,117,183,188,197],"labor-intensive":[43],"obtain.":[45],"In":[46],"this":[47,162],"paper,":[48],"we":[49],"propose":[50],"<bold":[51,59,66,69,72],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[52,60,67,70,73],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">3D-VLA</b>,":[53],"weakly":[55,174],"supervised":[56,175,204],"approach":[57],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">3D</b>":[61],"based":[64],"on":[65,87,186],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">V</b>isual":[68],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">L</b>anguage":[71],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">A</b>lignment.":[74],"Our":[75],"3D-VLA":[76,194],"exploits":[77],"the":[78,89,99,129,133,136,144,148,157,164,202],"superior":[79,199],"ability":[80],"current":[82],"large-scale":[83],"vision-language":[84,181],"models":[85],"(VLMs)":[86],"aligning":[88],"semantics":[90],"between":[91,103,115],"texts":[92,116],"2D":[94,104,154],"images,":[95],"as":[96,98],"well":[97],"naturally":[100],"correspondences":[102,114],"images":[105],"clouds,":[109],"thus":[111],"implicitly":[112],"constructs":[113],"with":[121],"no":[122],"need":[123],"fine-grained":[125],"training":[130],"procedure.":[131],"During":[132],"inference":[134],"stage,":[135],"learned":[137],"text-3D":[138],"correspondence":[139],"will":[140],"help":[141],"us":[142],"even":[152,198],"without":[153],"images.":[155],"To":[156],"best":[158],"our":[160,193],"knowledge,":[161],"first":[165],"work":[166],"investigate":[168],"manner":[176],"by":[177],"involving":[178],"large":[179],"scale":[180],"models,":[182],"extensive":[184],"experiments":[185],"ReferIt3D":[187],"ScanRefer":[189],"datasets":[190],"demonstrate":[191],"that":[192],"achieves":[195],"comparable":[196],"results":[200],"over":[201],"fully":[203],"methods.":[205]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
