{"id":"https://openalex.org/W7129354208","doi":"https://doi.org/10.1109/icipw68931.2025.11386418","title":"Closing the Modality Gap: Integrating LLMs With Lidar For 3D Object Detection and Object-Level Understanding","display_name":"Closing the Modality Gap: Integrating LLMs With Lidar For 3D Object Detection and Object-Level Understanding","publication_year":2025,"publication_date":"2025-09-14","ids":{"openalex":"https://openalex.org/W7129354208","doi":"https://doi.org/10.1109/icipw68931.2025.11386418"},"language":null,"primary_location":{"id":"doi:10.1109/icipw68931.2025.11386418","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icipw68931.2025.11386418","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Image Processing Workshops (ICIPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126200816","display_name":"Youngchae Chee","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Youngchae Chee","raw_affiliation_strings":["KAIST,School of Electrical Engineering,South Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering,South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108865173","display_name":"Taeheon Kim","orcid":"https://orcid.org/0000-0001-6334-3025"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Taeheon Kim","raw_affiliation_strings":["KAIST,School of Electrical Engineering,South Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering,South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126210047","display_name":"Youngjoon Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Youngjoon Yu","raw_affiliation_strings":["KAIST,School of Electrical Engineering,South Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering,South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108627129","display_name":"Hyun Wook Park","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hyun Wook Park","raw_affiliation_strings":["KAIST,School of Electrical Engineering,South Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering,South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124190178","display_name":"Yong Man Ro","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Yong Man Ro","raw_affiliation_strings":["KAIST,School of Electrical Engineering,South Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering,South Korea","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5126200816"],"corresponding_institution_ids":["https://openalex.org/I157485424"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75318659,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"528","last_page":"533"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7468000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7468000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.155799999833107,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.009399999864399433,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lidar","display_name":"Lidar","score":0.9090999960899353},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6122999787330627},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5613999962806702},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.54339998960495},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5045999884605408},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.48190000653266907},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4514000117778778}],"concepts":[{"id":"https://openalex.org/C51399673","wikidata":"https://www.wikidata.org/wiki/Q504027","display_name":"Lidar","level":2,"score":0.9090999960899353},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7595000267028809},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6122999787330627},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5782999992370605},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5613999962806702},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.54339998960495},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5045999884605408},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.48190000653266907},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4514000117778778},{"id":"https://openalex.org/C2778775528","wikidata":"https://www.wikidata.org/wiki/Q5135432","display_name":"Closing (real estate)","level":2,"score":0.4424999952316284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4325999915599823},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C62649853","wikidata":"https://www.wikidata.org/wiki/Q199687","display_name":"Remote sensing","level":1,"score":0.33329999446868896},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icipw68931.2025.11386418","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icipw68931.2025.11386418","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Image Processing Workshops (ICIPW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5628982186317444}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2296228853","https://openalex.org/W2968296999","https://openalex.org/W3035574168","https://openalex.org/W4403081598","https://openalex.org/W4403842477","https://openalex.org/W4404782234","https://openalex.org/W4409366009","https://openalex.org/W4415798919","https://openalex.org/W4416707474"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1,16],"Language":[2,17],"Models":[3,18],"(MLLMs),":[4],"which":[5],"integrate":[6],"multiple":[7],"modalities":[8],"such":[9,163],"as":[10,164],"vision":[11],"and":[12,36,64,85,107,152,160,167],"language":[13,114],"by":[14,59],"leveraging":[15],"(LLMs),":[19],"have":[20],"demonstrated":[21],"remarkable":[22],"progress":[23],"in":[24,149,158],"multimodal":[25,39,86],"understanding.":[26],"However,":[27],"the":[28,117,136,154],"incorporation":[29],"of":[30,82,156],"LiDAR\u2014a":[31],"key":[32],"modality":[33],"for":[34],"spatial":[35],"geometric":[37],"comprehension\u2014into":[38],"LLMs":[40],"(MLLMs)":[41],"remains":[42],"underdeveloped.":[43],"In":[44],"this":[45,142],"work,":[46],"we":[47,95],"devise":[48],"a":[49],"novel":[50],"framework":[51],"that":[52],"enhances":[53],"MLLMs":[54,157],"with":[55,69,88,121],"object-level":[56,83,93,146],"LiDAR":[57,67,78,105,110,140],"comprehension":[58],"jointly":[60],"aligning":[61],"both":[62],"scene-level":[63,77],"region-of-interest":[65],"(ROI)":[66],"features":[68],"natural":[70,113],"language.":[71,89],"Our":[72],"method":[73],"includes":[74],"three":[75],"components:":[76],"compression,":[79],"ROI-based":[80],"extraction":[81],"features,":[84],"alignment":[87],"To":[90],"support":[91],"accurate":[92],"understanding,":[94],"propose":[96],"two":[97],"complementary":[98],"tasks:":[99],"LiDAR-to-Class":[100],"(predicting":[101],"object":[102],"classes":[103],"from":[104,112],"features)":[106],"Class-to-LiDAR":[108,137],"(identifying":[109],"objects":[111],"descriptions).":[115],"Leveraging":[116,139],"large-scale":[118],"NuScenes":[119],"dataset":[120],"over":[122],"one":[123],"million":[124],"annotated":[125],"objects,":[126],"our":[127],"model":[128],"achieves":[129],"strong":[130],"performance,":[131],"including":[132],"95%":[133],"accuracy":[134],"on":[135],"task.":[138],"data,":[141],"work":[143],"advances":[144],"precise":[145],"semantic":[147],"understanding":[148],"3D":[150],"environments":[151],"underscores":[153],"potential":[155],"safety-critical":[159],"interactive":[161],"scenarios":[162],"autonomous":[165],"driving":[166],"robotics.":[168]},"counts_by_year":[],"updated_date":"2026-02-23T20:09:44.859080","created_date":"2026-02-18T00:00:00"}
