{"id":"https://openalex.org/W7164843861","doi":"https://doi.org/10.1145/3805622.3810857","title":"OpenSGG-VL: Open-Vocabulary 3DSGG with Orthogonal Residual Fusion and Iterative Relation Refinement","display_name":"OpenSGG-VL: Open-Vocabulary 3DSGG with Orthogonal Residual Fusion and Iterative Relation Refinement","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164843861","doi":"https://doi.org/10.1145/3805622.3810857"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810857","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810857","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810857","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5138676433","display_name":"Binbin Zhang","orcid":"https://orcid.org/0009-0003-9634-5213"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Binbin Zhang","raw_affiliation_strings":["Nanjing University of Science and Technology, Nan Jing, China"],"raw_orcid":"https://orcid.org/0009-0003-9634-5213","affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nan Jing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138647969","display_name":"Fang Zhou","orcid":"https://orcid.org/0009-0004-9485-8311"},"institutions":[{"id":"https://openalex.org/I4210134393","display_name":"China National Chemical Engineering (China)","ror":"https://ror.org/04pwn6a43","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210134393"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fang Zhou","raw_affiliation_strings":["National Key Laboratory of Information Systems Engineering, Nan Jing, China"],"raw_orcid":"https://orcid.org/0009-0004-9485-8311","affiliations":[{"raw_affiliation_string":"National Key Laboratory of Information Systems Engineering, Nan Jing, China","institution_ids":["https://openalex.org/I4210134393"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020302879","display_name":"Liang Xiao","orcid":"https://orcid.org/0000-0003-0178-9384"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Xiao","raw_affiliation_strings":["Nanjing University of Science and Technology, Nan Jing, China"],"raw_orcid":"https://orcid.org/0000-0003-0178-9384","affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nan Jing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086331195","display_name":"Zhiyong Su","orcid":"https://orcid.org/0000-0001-9483-5268"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Su","raw_affiliation_strings":["Nanjing University of Science and Technology, Nan Jing, China"],"raw_orcid":"https://orcid.org/0000-0001-9483-5268","affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nan Jing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101414940","display_name":"Weiqing Li","orcid":"https://orcid.org/0000-0002-1929-3654"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiqing Li","raw_affiliation_strings":["Nanjing University of Science and Technology, Nan Jing, China"],"raw_orcid":"https://orcid.org/0000-0002-1929-3654","affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nan Jing, China","institution_ids":["https://openalex.org/I36399199"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93841149,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"807","last_page":"816"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8274000287055969,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8274000287055969,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.04439999908208847,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.027699999511241913,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.7497000098228455},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.6074000000953674},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.49059998989105225},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.4603999853134155},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4221000075340271},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4169999957084656},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.4165000021457672},{"id":"https://openalex.org/keywords/property","display_name":"Property (philosophy)","score":0.4154999852180481},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4115000069141388}],"concepts":[{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.7497000098228455},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.6074000000953674},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5537999868392944},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5464000105857849},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.49059998989105225},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.48399999737739563},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.4603999853134155},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4169999957084656},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.4165000021457672},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.4154999852180481},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4115000069141388},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.40209999680519104},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.392300009727478},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.3898000121116638},{"id":"https://openalex.org/C156103551","wikidata":"https://www.wikidata.org/wiki/Q130998","display_name":"Equivalence relation","level":2,"score":0.38850000500679016},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.366100013256073},{"id":"https://openalex.org/C143587482","wikidata":"https://www.wikidata.org/wiki/Q1543216","display_name":"Iterative and incremental development","level":2,"score":0.3562999963760376},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34209999442100525},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3230000138282776},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.31779998540878296},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C17137986","wikidata":"https://www.wikidata.org/wiki/Q215067","display_name":"Orthogonality","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.29840001463890076},{"id":"https://openalex.org/C164660894","wikidata":"https://www.wikidata.org/wiki/Q2037833","display_name":"Piecewise","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C2779982483","wikidata":"https://www.wikidata.org/wiki/Q6094420","display_name":"Iterative refinement","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.26109999418258667},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810857","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810857","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810857","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810857","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.4812639057636261,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2479423890","https://openalex.org/W2963319519","https://openalex.org/W2981422075","https://openalex.org/W3025468529","https://openalex.org/W3035154952","https://openalex.org/W3114942877","https://openalex.org/W3173271937","https://openalex.org/W3183042936","https://openalex.org/W4214755140","https://openalex.org/W4386065742","https://openalex.org/W4386076015","https://openalex.org/W4386076041","https://openalex.org/W4386076183","https://openalex.org/W4386083046","https://openalex.org/W4390873101","https://openalex.org/W4394625817","https://openalex.org/W4394631207","https://openalex.org/W4401414715","https://openalex.org/W4401415891","https://openalex.org/W4401634890","https://openalex.org/W4401990649","https://openalex.org/W4402354022","https://openalex.org/W4402704531","https://openalex.org/W4402726938","https://openalex.org/W4402915938","https://openalex.org/W4413147725","https://openalex.org/W7133185086","https://openalex.org/W7133211372","https://openalex.org/W7133224126"],"related_works":[],"abstract_inverted_index":{"Open-vocabulary":[0],"3D":[1,15,38,61,92],"scene":[2,150],"graph":[3],"generation":[4],"(3DSGG)":[5],"aims":[6],"to":[7,36,75],"predict":[8],"object":[9],"categories":[10],"and":[11,46,88,118,132,146],"relation":[12,112,120,130],"triplets":[13],"from":[14,85],"scans":[16],"while":[17,105],"generalizing":[18],"beyond":[19],"a":[20,56],"fixed":[21],"label":[22],"set.":[23],"Prior":[24],"open-vocabulary":[25,144],"methods":[26],"commonly":[27],"rely":[28],"on":[29,136],"2D":[30,83],"features":[31,74],"as":[32,122],"an":[33,116,123],"intermediate":[34],"bridge":[35],"connect":[37],"representations":[39],"with":[40,91,128],"language,":[41],"which":[42,99],"can":[43],"introduce":[44],"misalignment":[45],"unstable":[47],"multimodal":[48],"fusion.":[49],"In":[50],"this":[51],"work,":[52],"we":[53,81,114],"propose":[54],"OpenSGG-VL,":[55],"framework":[57],"that":[58,139],"learns":[59],"text-aligned":[60],"instance":[62],"embeddings":[63,84,93],"via":[64],"large-scale":[65],"3D-Text":[66],"contrastive":[67],"learning,":[68],"further":[69],"enhanced":[70],"by":[71],"lightweight":[72],"pose":[73],"retain":[76],"spatial":[77],"cues.":[78],"At":[79],"inference,":[80],"extract":[82],"RGB":[86],"views":[87],"fuse":[89],"them":[90],"using":[94],"Orthogonal":[95],"Residual":[96],"Fusion":[97],"(ORF),":[98],"preserves":[100],"the":[101],"dominant":[102],"semantic":[103],"direction":[104],"injecting":[106],"complementary":[107],"geometric":[108],"residuals.":[109],"For":[110],"open-set":[111],"prediction,":[113],"employ":[115],"LLM":[117],"formulate":[119],"inference":[121],"iterative,":[124],"scene-consistent":[125],"refinement":[126],"process":[127],"grouped":[129],"decoding":[131],"multi-stage":[133],"optimization.":[134],"Experiments":[135],"3DSSG":[137],"demonstrate":[138],"our":[140],"method":[141],"achieves":[142],"strong":[143],"performance":[145],"produces":[147],"more":[148],"coherent":[149],"graphs":[151],"than":[152],"prior":[153],"baselines.":[154]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
