{"id":"https://openalex.org/W7151040226","doi":"https://doi.org/10.48550/arxiv.2604.02546","title":"Contrastive Language-Colored Pointmap Pretraining for Unified 3D Scene Understanding","display_name":"Contrastive Language-Colored Pointmap Pretraining for Unified 3D Scene Understanding","publication_year":2026,"publication_date":"2026-04-02","ids":{"openalex":"https://openalex.org/W7151040226","doi":"https://doi.org/10.48550/arxiv.2604.02546"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.02546","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02546","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.02546","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132555034","display_name":"Ye Mao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mao, Ye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116143562","display_name":"Weixun Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Weixun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133031508","display_name":"Ranran Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ranran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002320772","display_name":"Junpeng Jing","orcid":"https://orcid.org/0000-0001-5669-8573"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jing, Junpeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5024769212","display_name":"Krystian Mikolajczyk","orcid":"https://orcid.org/0000-0003-0726-9187"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mikolajczyk, Krystian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.2847000062465668,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.2847000062465668,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1657000035047531,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.10000000149011612,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5652999877929688},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4805999994277954},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4458000063896179},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.35929998755455017},{"id":"https://openalex.org/keywords/colored","display_name":"Colored","score":0.3553999960422516},{"id":"https://openalex.org/keywords/3d-model","display_name":"3d model","score":0.31299999356269836}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7324000000953674},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6967999935150146},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6111000180244446},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5652999877929688},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4805999994277954},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4458000063896179},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C2778307483","wikidata":"https://www.wikidata.org/wiki/Q5149038","display_name":"Colored","level":2,"score":0.3553999960422516},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3393999934196472},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.31299999356269836},{"id":"https://openalex.org/C2777299769","wikidata":"https://www.wikidata.org/wiki/Q3707858","display_name":"Type (biology)","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C197654239","wikidata":"https://www.wikidata.org/wiki/Q7430757","display_name":"Scene statistics","level":3,"score":0.26409998536109924},{"id":"https://openalex.org/C3004257","wikidata":"https://www.wikidata.org/wiki/Q17084606","display_name":"Correspondence problem","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C79106606","wikidata":"https://www.wikidata.org/wiki/Q735197","display_name":"Afterimage","level":3,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.02546","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02546","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.02546","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02546","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4246378540992737}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pretraining":[0,9],"3D":[1,22,87,103],"encoders":[2],"by":[3],"aligning":[4],"with":[5],"Contrastive":[6],"Language":[7],"Image":[8],"(CLIP)":[10],"has":[11],"emerged":[12],"as":[13],"a":[14,31],"promising":[15],"direction":[16],"to":[17,65],"learn":[18],"generalizable":[19],"representations":[20,38],"for":[21,101],"scene":[23,37,81,83,104],"understanding.":[24,105],"In":[25],"this":[26],"paper,":[27],"we":[28,55],"propose":[29],"UniScene3D,":[30],"transformer-based":[32],"encoder":[33],"that":[34],"learns":[35],"unified":[36,102],"from":[39],"multi-view":[40],"colored":[41,51],"pointmaps,":[42],"jointly":[43],"modeling":[44],"image":[45],"appearance":[46],"and":[47,61,69,74,86],"geometry.":[48],"For":[49],"robust":[50],"pointmap":[52],"representation":[53],"learning,":[54],"introduce":[56],"novel":[57],"cross-view":[58,67],"geometric":[59],"alignment":[60,64],"grounded":[62],"view":[63],"enforce":[66],"geometry":[68],"semantic":[70],"consistency.":[71],"Extensive":[72],"low-shot":[73],"task-specific":[75],"fine-tuning":[76],"evaluations":[77],"on":[78],"viewpoint":[79],"grounding,":[80],"retrieval,":[82],"type":[84],"classification,":[85],"VQA":[88],"demonstrate":[89],"our":[90,99],"state-of-the-art":[91],"performance.":[92],"These":[93],"results":[94],"highlight":[95],"the":[96],"effectiveness":[97],"of":[98],"approach":[100],"https://yebulabula.github.io/UniScene3D/":[106]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2026-04-07T00:00:00"}
