{"id":"https://openalex.org/W4377372012","doi":"https://doi.org/10.48550/arxiv.2305.11768","title":"Generating Visual Spatial Description via Holistic 3D Scene Understanding","display_name":"Generating Visual Spatial Description via Holistic 3D Scene Understanding","publication_year":2023,"publication_date":"2023-05-19","ids":{"openalex":"https://openalex.org/W4377372012","doi":"https://doi.org/10.48550/arxiv.2305.11768"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2305.11768","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.11768","pdf_url":"https://arxiv.org/pdf/2305.11768","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2305.11768","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100701157","display_name":"Yu Zhao","orcid":"https://orcid.org/0000-0002-0606-4676"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhao, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019435679","display_name":"Fei Hao","orcid":"https://orcid.org/0000-0003-4942-0893"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fei, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101397422","display_name":"Wei Ji","orcid":"https://orcid.org/0000-0002-8106-9768"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027428789","display_name":"Jianguo Wei","orcid":"https://orcid.org/0000-0002-8964-9759"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Jianguo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004953265","display_name":"Meishan Zhang","orcid":"https://orcid.org/0000-0001-6335-1340"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Meishan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100402911","display_name":"Min Zhang","orcid":"https://orcid.org/0000-0002-3895-5510"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Min","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5089404640","display_name":"Tat\u2010Seng Chua","orcid":"https://orcid.org/0000-0001-6097-7807"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chua, Tat-Seng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100701157"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10757","display_name":"Geographic Information Systems Studies","score":0.9739999771118164,"subfield":{"id":"https://openalex.org/subfields/3305","display_name":"Geography, Planning and Development"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7817926406860352},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.6052534580230713},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5938804745674133},{"id":"https://openalex.org/keywords/scene-graph","display_name":"Scene graph","score":0.5901780128479004},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5419214367866516},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.530328631401062},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5301103591918945},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4907182455062866},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.43386948108673096},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3827867805957794},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.19365304708480835},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.0797799825668335}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7817926406860352},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.6052534580230713},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5938804745674133},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.5901780128479004},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5419214367866516},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.530328631401062},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5301103591918945},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4907182455062866},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.43386948108673096},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3827867805957794},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.19365304708480835},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.0797799825668335},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2305.11768","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.11768","pdf_url":"https://arxiv.org/pdf/2305.11768","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2305.11768","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2305.11768","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2305.11768","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.11768","pdf_url":"https://arxiv.org/pdf/2305.11768","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4377372012.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4214591816","https://openalex.org/W4389471147","https://openalex.org/W4312650216","https://openalex.org/W4387129494","https://openalex.org/W4221155265","https://openalex.org/W3042849301","https://openalex.org/W3109142545","https://openalex.org/W4385571298","https://openalex.org/W2590022237","https://openalex.org/W4390195594"],"abstract_inverted_index":{"Visual":[0],"spatial":[1,11,38,82,91,149],"description":[2],"(VSD)":[3],"aims":[4],"to":[5,33,122],"generate":[6],"texts":[7],"that":[8,87,134],"describe":[9],"the":[10,14,24,34,48,64,90,97,115,138,144],"relations":[12],"of":[13,36,40,50,93],"given":[15],"objects":[16,66,95],"within":[17,96],"images.":[18],"Existing":[19],"VSD":[20,131],"work":[21],"merely":[22],"models":[23],"2D":[25],"geometrical":[26],"vision":[27],"features,":[28],"thus":[29],"inevitably":[30],"falling":[31],"prey":[32],"problem":[35],"skewed":[37],"understanding":[39],"target":[41,79,94],"objects.":[42],"In":[43],"this":[44],"work,":[45],"we":[46,62,76,88,102],"investigate":[47],"incorporation":[49],"3D":[51,59,65,81,99],"scene":[52,60,68,83,105],"features":[53,69,119],"for":[54,70],"VSD.":[55],"With":[56],"an":[57],"external":[58],"extractor,":[61],"obtain":[63],"and":[67],"input":[71],"images,":[72],"based":[73],"on":[74,129,143],"which":[75],"construct":[77],"a":[78,104],"object-centered":[80],"graph":[84],"(Go3D-S2G),":[85],"such":[86],"model":[89],"semantics":[92],"holistic":[98],"scenes.":[100],"Besides,":[101],"propose":[103],"subgraph":[106],"selecting":[107],"mechanism,":[108],"sampling":[109],"topologically-diverse":[110],"subgraphs":[111],"from":[112],"Go3D-S2G,":[113],"where":[114],"diverse":[116],"local":[117],"structure":[118],"are":[120],"navigated":[121],"yield":[123],"spatially-diversified":[124,157],"text":[125],"generation.":[126,158],"Experimental":[127],"results":[128],"two":[130],"datasets":[132],"demonstrate":[133],"our":[135,152],"framework":[136],"outperforms":[137],"baselines":[139],"significantly,":[140],"especially":[141],"improving":[142],"cases":[145],"with":[146],"complex":[147],"visual":[148],"relations.":[150],"Meanwhile,":[151],"method":[153],"can":[154],"produce":[155],"more":[156],"Code":[159],"is":[160],"available":[161],"at":[162],"https://github.com/zhaoyucs/VSD.":[163]},"counts_by_year":[{"year":2024,"cited_by_count":7}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2023-05-23T00:00:00"}
