{"id":"https://openalex.org/W4415821582","doi":"https://doi.org/10.1109/ro-man63969.2025.11217897","title":"LM-MCVT: A Lightweight Multi-Modal Multi-View Convolutional-Vision Transformer Approach for 3D Object Recognition","display_name":"LM-MCVT: A Lightweight Multi-Modal Multi-View Convolutional-Vision Transformer Approach for 3D Object Recognition","publication_year":2025,"publication_date":"2025-08-25","ids":{"openalex":"https://openalex.org/W4415821582","doi":"https://doi.org/10.1109/ro-man63969.2025.11217897"},"language":"en","primary_location":{"id":"doi:10.1109/ro-man63969.2025.11217897","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ro-man63969.2025.11217897","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th IEEE International Conference on Robot and Human Interactive Communication (RO-MAN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://research.rug.nl/en/publications/f83ede05-7125-4446-89e9-d78d07e9eaf7","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101187848","display_name":"Songsong Xiong","orcid":null},"institutions":[{"id":"https://openalex.org/I169381384","display_name":"University of Groningen","ror":"https://ror.org/012p63287","country_code":"NL","type":"education","lineage":["https://openalex.org/I169381384"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Songsong Xiong","raw_affiliation_strings":["University of Groningen,Department of Artificial Intelligence,Groningen,The Netherlands"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Groningen,Department of Artificial Intelligence,Groningen,The Netherlands","institution_ids":["https://openalex.org/I169381384"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5008655688","display_name":"Hamidreza Kasaei","orcid":"https://orcid.org/0000-0001-9408-7730"},"institutions":[{"id":"https://openalex.org/I169381384","display_name":"University of Groningen","ror":"https://ror.org/012p63287","country_code":"NL","type":"education","lineage":["https://openalex.org/I169381384"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Hamidreza Kasaei","raw_affiliation_strings":["University of Groningen,Department of Artificial Intelligence,Groningen,The Netherlands"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Groningen,Department of Artificial Intelligence,Groningen,The Netherlands","institution_ids":["https://openalex.org/I169381384"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5101187848"],"corresponding_institution_ids":["https://openalex.org/I169381384"],"apc_list":null,"apc_paid":null,"fwci":0.9573,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.8106399,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"141","last_page":"148"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7860999703407288,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7860999703407288,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.03999999910593033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.025100000202655792,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6175000071525574},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5629000067710876},{"id":"https://openalex.org/keywords/cognitive-neuroscience-of-visual-object-recognition","display_name":"Cognitive neuroscience of visual object recognition","score":0.5583999752998352},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5455999970436096},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.5299000144004822},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.51910001039505},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.45019999146461487},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.4097999930381775}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7098000049591064},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6531999707221985},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6175000071525574},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5629000067710876},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.5583999752998352},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5455999970436096},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.5299000144004822},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.51910001039505},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49570000171661377},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.45019999146461487},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.4097999930381775},{"id":"https://openalex.org/C31510193","wikidata":"https://www.wikidata.org/wiki/Q1192553","display_name":"Facial recognition system","level":3,"score":0.3977000117301941},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3653999865055084},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.36480000615119934},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3278999924659729},{"id":"https://openalex.org/C14551309","wikidata":"https://www.wikidata.org/wiki/Q4636325","display_name":"3D single-object recognition","level":4,"score":0.3018999993801117},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.27549999952316284},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2653999924659729},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26179999113082886},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/ro-man63969.2025.11217897","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ro-man63969.2025.11217897","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th IEEE International Conference on Robot and Human Interactive Communication (RO-MAN)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.rug.nl:openaire/f83ede05-7125-4446-89e9-d78d07e9eaf7","is_oa":true,"landing_page_url":"https://research.rug.nl/en/publications/f83ede05-7125-4446-89e9-d78d07e9eaf7","pdf_url":null,"source":{"id":"https://openalex.org/S4306400420","display_name":"University of Groningen research database (University of Groningen / Centre for Information Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I169381384","host_organization_name":"University of Groningen","host_organization_lineage":["https://openalex.org/I169381384"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Xiong, S & Kasaei, H 2025, LM-MCVT : A Lightweight Multi-Modal Multi-View Convolutional-Vision Transformer Approach for 3D Object Recognition. in 2025 34th IEEE International Conference on Robot and Human Interactive Communication, RO-MAN 2025. IEEE International Workshop on Robot and Human Communication, RO-MAN, IEEE Computer Society, pp. 141-148, 34th IEEE International Conference on Robot and Human Interactive Communication, RO-MAN 2025, Hybrid, Eindhoven, Netherlands, 25/08/2025. https://doi.org/10.1109/RO-MAN63969.2025.11217897","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:pure.rug.nl:publications/f83ede05-7125-4446-89e9-d78d07e9eaf7","is_oa":true,"landing_page_url":"https://hdl.handle.net/11370/f83ede05-7125-4446-89e9-d78d07e9eaf7","pdf_url":null,"source":{"id":"https://openalex.org/S4306400420","display_name":"University of Groningen research database (University of Groningen / Centre for Information Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I169381384","host_organization_name":"University of Groningen","host_organization_lineage":["https://openalex.org/I169381384"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Xiong, S & Kasaei, H 2025, LM-MCVT : A Lightweight Multi-Modal Multi-View Convolutional-Vision Transformer Approach for 3D Object Recognition. in 2025 34th IEEE International Conference on Robot and Human Interactive Communication, RO-MAN 2025. IEEE International Workshop on Robot and Human Communication, RO-MAN, IEEE Computer Society, pp. 141-148, 34th IEEE International Conference on Robot and Human Interactive Communication, RO-MAN 2025, Hybrid, Eindhoven, Netherlands, 25/08/2025. https://doi.org/10.1109/RO-MAN63969.2025.11217897","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"pmh:oai:pure.rug.nl:openaire/f83ede05-7125-4446-89e9-d78d07e9eaf7","is_oa":true,"landing_page_url":"https://research.rug.nl/en/publications/f83ede05-7125-4446-89e9-d78d07e9eaf7","pdf_url":null,"source":{"id":"https://openalex.org/S4306400420","display_name":"University of Groningen research database (University of Groningen / Centre for Information Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I169381384","host_organization_name":"University of Groningen","host_organization_lineage":["https://openalex.org/I169381384"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Xiong, S & Kasaei, H 2025, LM-MCVT : A Lightweight Multi-Modal Multi-View Convolutional-Vision Transformer Approach for 3D Object Recognition. in 2025 34th IEEE International Conference on Robot and Human Interactive Communication, RO-MAN 2025. IEEE International Workshop on Robot and Human Communication, RO-MAN, IEEE Computer Society, pp. 141-148, 34th IEEE International Conference on Robot and Human Interactive Communication, RO-MAN 2025, Hybrid, Eindhoven, Netherlands, 25/08/2025. https://doi.org/10.1109/RO-MAN63969.2025.11217897","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1644641054","https://openalex.org/W1920022804","https://openalex.org/W2211722331","https://openalex.org/W2789835518","https://openalex.org/W2893477965","https://openalex.org/W2920456178","https://openalex.org/W2964253930","https://openalex.org/W2976674591","https://openalex.org/W3130885760","https://openalex.org/W3136875090","https://openalex.org/W3168857782","https://openalex.org/W3171082807","https://openalex.org/W3179877418","https://openalex.org/W3210255049","https://openalex.org/W4312366074","https://openalex.org/W4312788538","https://openalex.org/W4323057853","https://openalex.org/W4362631162","https://openalex.org/W4386113267","https://openalex.org/W4386599968","https://openalex.org/W4388894573","https://openalex.org/W4389667009","https://openalex.org/W4391060517","https://openalex.org/W4392644266","https://openalex.org/W4392941772","https://openalex.org/W4402704535","https://openalex.org/W4402915929","https://openalex.org/W4404217576","https://openalex.org/W4409365601","https://openalex.org/W4411244563"],"related_works":[],"abstract_inverted_index":{"In":[0,33],"human-centered":[1],"environments":[2],"such":[3],"as":[4],"restaurants,":[5],"homes,":[6],"and":[7,24,74,78,80,87,99,148],"warehouses,":[8],"robots":[9],"often":[10],"face":[11],"challenges":[12,19],"in":[13,52,142],"accurately":[14],"recognizing":[15],"3D":[16,49,143,150],"objects.":[17],"These":[18],"stem":[20],"from":[21],"the":[22,58,95,125,130,139],"complexity":[23],"variability":[25],"of":[26,104],"these":[27],"environments,":[28],"including":[29],"diverse":[30],"object":[31,50,144],"shapes.":[32],"this":[34],"paper,":[35],"we":[36,120],"propose":[37],"a":[38,101,107],"novel":[39],"Lightweight":[40],"Multi-Modal":[41],"Multi-View":[42],"Convolutional-Vision":[43],"Transformer":[44],"network":[45],"(LM-MCVT)":[46],"to":[47,65,83],"enhance":[48,84],"recognition":[51,88,102,145],"robotic":[53],"applications.":[54],"Our":[55],"approach":[56],"leverages":[57],"Globally":[59],"Entropy-Based":[60],"Embeddings":[61],"Fusion":[62],"(GEEF)":[63],"method":[64,93],"integrate":[66],"multi-views":[67],"efficiently.":[68],"The":[69],"LM-MCVT":[70],"architecture":[71],"incorporates":[72],"pre-":[73],"mid-level":[75],"convolutional":[76],"encoders":[77],"local":[79],"global":[81],"transformers":[82],"feature":[85],"extraction":[86],"accuracy.":[89],"We":[90],"evaluate":[91],"our":[92],"on":[94,124],"synthetic":[96,147],"ModelNet40":[97],"dataset":[98,128],"achieve":[100],"accuracy":[103],"95.6%":[105],"using":[106,129],"four-view":[108],"setup,":[109],"surpassing":[110],"existing":[111],"state-of-the-":[112],"art":[113],"methods.":[114],"To":[115],"further":[116],"validate":[117],"its":[118],"effectiveness,":[119],"conduct":[121],"5-fold":[122],"cross-validation":[123],"real-world":[126,149],"OmniObject3D":[127],"same":[131],"configuration.":[132],"Results":[133],"consistently":[134],"show":[135],"superior":[136],"performance,":[137],"demonstrating":[138],"method\u2019s":[140],"robustness":[141],"across":[146],"data.":[151]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-05T09:01:59.212387","created_date":"2025-11-03T00:00:00"}