{"id":"https://openalex.org/W4403524638","doi":"https://doi.org/10.48550/arxiv.2409.03757","title":"Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene Understanding","display_name":"Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene Understanding","publication_year":2024,"publication_date":"2024-09-05","ids":{"openalex":"https://openalex.org/W4403524638","doi":"https://doi.org/10.48550/arxiv.2409.03757"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2409.03757","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.03757","pdf_url":"https://arxiv.org/pdf/2409.03757","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2409.03757","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010773823","display_name":"Yunze Man","orcid":"https://orcid.org/0000-0002-2357-2883"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Man, Yunze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075364910","display_name":"Shuhong Zheng","orcid":"https://orcid.org/0000-0003-3160-9532"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Shuhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035492421","display_name":"Zhipeng Bao","orcid":"https://orcid.org/0000-0001-8376-1195"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bao, Zhipeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075246991","display_name":"Martial Hebert","orcid":"https://orcid.org/0000-0003-4566-5930"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hebert, Martial","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016962709","display_name":"Liang-Yan Gui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gui, Liang-Yan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5049973776","display_name":"Yu-Xiong Wang","orcid":"https://orcid.org/0000-0003-4414-0198"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yu-Xiong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5010773823"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9757000207901001,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9757000207901001,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9692999720573425,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11211","display_name":"3D Surveying and Cultural Heritage","score":0.9678999781608582,"subfield":{"id":"https://openalex.org/subfields/1907","display_name":"Geology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.8715416193008423},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4561850130558014},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.41217195987701416},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37995025515556335},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.22450095415115356},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.20755049586296082},{"id":"https://openalex.org/keywords/archaeology","display_name":"Archaeology","score":0.12110841274261475}],"concepts":[{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.8715416193008423},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4561850130558014},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.41217195987701416},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37995025515556335},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.22450095415115356},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.20755049586296082},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.12110841274261475}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2409.03757","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.03757","pdf_url":"https://arxiv.org/pdf/2409.03757","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2409.03757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2409.03757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2409.03757","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.03757","pdf_url":"https://arxiv.org/pdf/2409.03757","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2926720356","display_name":null,"funder_award_id":"32799","funder_id":"https://openalex.org/F4320332299","funder_display_name":"National Institute of Food and Agriculture"},{"id":"https://openalex.org/G4801515358","display_name":null,"funder_award_id":"ACCESS","funder_id":"https://openalex.org/F4320337377","funder_display_name":"Office of Advanced Cyberinfrastructure"},{"id":"https://openalex.org/G539587745","display_name":null,"funder_award_id":"2106825","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5620962805","display_name":null,"funder_award_id":"67021","funder_id":"https://openalex.org/F4320332299","funder_display_name":"National Institute of Food and Agriculture"},{"id":"https://openalex.org/G6770826516","display_name":null,"funder_award_id":"2020-67021-32799","funder_id":"https://openalex.org/F4320332299","funder_display_name":"National Institute of Food and Agriculture"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320312143","display_name":"National Centre for Supercomputing Applications","ror":"https://ror.org/03r10zj06"},{"id":"https://openalex.org/F4320315934","display_name":"Toyota Research Institute","ror":null},{"id":"https://openalex.org/F4320332299","display_name":"National Institute of Food and Agriculture","ror":"https://ror.org/05qx3fv49"},{"id":"https://openalex.org/F4320337377","display_name":"Office of Advanced Cyberinfrastructure","ror":"https://ror.org/04nh1dc89"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403524638.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2381393187","https://openalex.org/W2332779545","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W2358060160","https://openalex.org/W2035483685"],"abstract_inverted_index":{"Complex":[0],"3D":[1,52,77],"scene":[2,9,22,53,101],"understanding":[3],"has":[4],"gained":[5],"increasing":[6],"attention,":[7],"with":[8],"encoding":[10,23,49],"strategies":[11,24],"playing":[12],"a":[13,42],"crucial":[14],"role":[15],"in":[16,84,115,129,155],"this":[17,38],"success.":[18],"However,":[19],"the":[20,56,148],"optimal":[21],"for":[25,51,150],"various":[26,47],"scenarios":[27],"remain":[28],"unclear,":[29],"particularly":[30],"compared":[31],"to":[32],"their":[33],"image-based":[34],"counterparts.":[35],"To":[36],"address":[37],"issue,":[39],"we":[40],"present":[41],"comprehensive":[43],"study":[44],"that":[45],"probes":[46],"visual":[48,143],"models":[50,83,113,119,125],"understanding,":[54],"identifying":[55],"strengths":[57],"and":[58,76,93,123,146,158],"limitations":[59,128],"of":[60,100],"each":[61,95],"model":[62],"across":[63],"different":[64,98],"scenarios.":[65],"Our":[66,103],"evaluation":[67],"spans":[68],"seven":[69],"vision":[70],"foundation":[71,78,144],"encoders,":[72],"including":[73],"image-based,":[74],"video-based,":[75],"models.":[79],"We":[80],"evaluate":[81],"these":[82],"four":[85],"tasks:":[86],"Vision-Language":[87],"Scene":[88],"Reasoning,":[89],"Visual":[90],"Grounding,":[91],"Segmentation,":[92],"Registration,":[94],"focusing":[96],"on":[97,141],"aspects":[99],"understanding.":[102],"evaluations":[104],"yield":[105],"key":[106],"findings:":[107],"DINOv2":[108],"demonstrates":[109],"superior":[110],"performance,":[111],"video":[112],"excel":[114],"object-level":[116],"tasks,":[117,122],"diffusion":[118],"benefit":[120],"geometric":[121],"language-pretrained":[124],"show":[126],"unexpected":[127],"language-related":[130],"tasks.":[131,160],"These":[132],"insights":[133],"challenge":[134],"some":[135],"conventional":[136],"understandings,":[137],"provide":[138],"novel":[139],"perspectives":[140],"leveraging":[142],"models,":[145],"highlight":[147],"need":[149],"more":[151],"flexible":[152],"encoder":[153],"selection":[154],"future":[156],"vision-language":[157],"scene-understanding":[159],"Code:":[161],"https://github.com/YunzeMan/Lexicon3D":[162]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
