{"id":"https://openalex.org/W4409129939","doi":"https://doi.org/10.1109/tmm.2025.3557703","title":"Multi-Modal Self-Perception Enhanced Large Language Model for 3D Region-of-Interest Captioning With Limited Data","display_name":"Multi-Modal Self-Perception Enhanced Large Language Model for 3D Region-of-Interest Captioning With Limited Data","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4409129939","doi":"https://doi.org/10.1109/tmm.2025.3557703"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3557703","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3557703","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100931299","display_name":"Lu Shi","orcid":"https://orcid.org/0000-0003-3203-630X"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lu Shi","raw_affiliation_strings":["State Key Laboratory of Advanced Rail Autonomous Operation, the School of Computer Science and Technology, and Visual Intellgence +X International Cooperation Joint Laboratory of MOE, Bejing Jiaotong University, Beijing, China","Institute of Information Science, Bejing Jiaotong University and the Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Advanced Rail Autonomous Operation, the School of Computer Science and Technology, and Visual Intellgence +X International Cooperation Joint Laboratory of MOE, Bejing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]},{"raw_affiliation_string":"Institute of Information Science, Bejing Jiaotong University and the Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025341442","display_name":"Shichao Kan","orcid":"https://orcid.org/0000-0003-0097-6196"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shichao Kan","raw_affiliation_strings":["School of Computer Science and Engineering, Central South University, Changsha, China","School of Computer Science and Engineering, Central South University, Changsha, Hunan, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, China","institution_ids":["https://openalex.org/I139660479"]},{"raw_affiliation_string":"School of Computer Science and Engineering, Central South University, Changsha, Hunan, China","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033896100","display_name":"Yi Jin","orcid":"https://orcid.org/0000-0001-8408-3816"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Jin","raw_affiliation_strings":["Key Laboratory of Big Data and Artificial Intelligence in Transportation, Ministry of Education and the School of Computer Science and Technology, Beijing Jiaotong University, Beijing, China","Key Laboratory of Big Data and Artificial Intelligence in Transportation, Ministry of Education and the School of Computer and lnformation Technology, Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Big Data and Artificial Intelligence in Transportation, Ministry of Education and the School of Computer Science and Technology, Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]},{"raw_affiliation_string":"Key Laboratory of Big Data and Artificial Intelligence in Transportation, Ministry of Education and the School of Computer and lnformation Technology, Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Linna Zhang","orcid":"https://orcid.org/0000-0001-6255-9422"},"institutions":[{"id":"https://openalex.org/I178232147","display_name":"Guizhou University","ror":"https://ror.org/02wmsc916","country_code":"CN","type":"education","lineage":["https://openalex.org/I178232147"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Linna Zhang","raw_affiliation_strings":["School of Mechanical Engineering, Guizhou University, Guiyang, China"],"affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering, Guizhou University, Guiyang, China","institution_ids":["https://openalex.org/I178232147"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5082462498","display_name":"Yigang Cen","orcid":"https://orcid.org/0000-0001-6255-9422"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yigang Cen","raw_affiliation_strings":["State Key Laboratory of Advanced Rail Autonomous Operation, the School of Computer Science and Technology, and Visual Intellgence +X International Cooperation Joint Laboratory of MOE, Bejing Jiaotong University, Beijing, China","Institute of Information Science, Bejing Jiaotong University and the Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Advanced Rail Autonomous Operation, the School of Computer Science and Technology, and Visual Intellgence +X International Cooperation Joint Laboratory of MOE, Bejing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]},{"raw_affiliation_string":"Institute of Information Science, Bejing Jiaotong University and the Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100931299"],"corresponding_institution_ids":["https://openalex.org/I21193070"],"apc_list":null,"apc_paid":null,"fwci":3.6542,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.92418318,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":"27","issue":null,"first_page":"2935","last_page":"2948"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.986299991607666,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9818000197410583,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8361499309539795},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7827916145324707},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5921429395675659},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5914292335510254},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.5100939869880676},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4496798515319824},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44700998067855835},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.43837568163871765},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4222731292247772},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.1209283173084259},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.0868491530418396}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8361499309539795},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7827916145324707},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5921429395675659},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5914292335510254},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.5100939869880676},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4496798515319824},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44700998067855835},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.43837568163871765},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4222731292247772},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.1209283173084259},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0868491530418396},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3557703","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3557703","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5299999713897705,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G3644680343","display_name":null,"funder_award_id":"62463002","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G386497372","display_name":null,"funder_award_id":"62062021","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G53838021","display_name":null,"funder_award_id":"62473033","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8222222245","display_name":null,"funder_award_id":"62202499","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W2166821106","https://openalex.org/W2277195237","https://openalex.org/W2594519801","https://openalex.org/W2963109634","https://openalex.org/W2970641574","https://openalex.org/W3095974555","https://openalex.org/W3096609285","https://openalex.org/W3107521863","https://openalex.org/W3156636935","https://openalex.org/W3176641147","https://openalex.org/W4312270234","https://openalex.org/W4312852845","https://openalex.org/W4376466999","https://openalex.org/W4385731905","https://openalex.org/W4385945203","https://openalex.org/W4386071618","https://openalex.org/W4386076628","https://openalex.org/W4386918784","https://openalex.org/W4388543957","https://openalex.org/W4390872495","https://openalex.org/W4393149524","https://openalex.org/W4394773594","https://openalex.org/W4399310957","https://openalex.org/W4400647053","https://openalex.org/W4401991200","https://openalex.org/W4402716423","https://openalex.org/W4402753807","https://openalex.org/W4403778769","https://openalex.org/W4410706693","https://openalex.org/W6757817989","https://openalex.org/W6796581206","https://openalex.org/W6811340617","https://openalex.org/W6846007759","https://openalex.org/W6849177959","https://openalex.org/W6850625674","https://openalex.org/W6851592950","https://openalex.org/W6853264062","https://openalex.org/W6854262950","https://openalex.org/W6854507508","https://openalex.org/W6855425132","https://openalex.org/W6856328902","https://openalex.org/W6867259787"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W2963177403","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4283207562","https://openalex.org/W2330246314","https://openalex.org/W2949522393","https://openalex.org/W4289422896"],"abstract_inverted_index":{"3D":[0,15,78,147,166,179,216],"Region-of-Interest":[1],"(RoI)":[2],"Captioning":[3],"involves":[4],"translating":[5],"a":[6,13,69,98,105,136,152,184],"model's":[7],"understanding":[8],"of":[9,116,121,146,193],"specific":[10],"objects":[11],"within":[12],"complex":[14],"scene":[16],"into":[17,229],"descriptive":[18],"captions.":[19,62],"Recent":[20],"advancements":[21],"in":[22,31],"Large":[23,70],"Language":[24,71],"Models":[25],"(LLMs)":[26],"have":[27],"shown":[28],"great":[29],"potential":[30],"this":[32,48,64],"area.":[33],"Existing":[34],"methods":[35],"capture":[36],"the":[37,130,143,161,173,191,194,221],"visual":[38,132],"information":[39,55,112,125,197],"from":[40,118],"RoIs":[41],"as":[42],"input":[43],"tokens":[44],"for":[45,56,77,140,233,242],"LLMs.":[46,122,141],"However,":[47],"approach":[49],"may":[50],"not":[51],"provide":[52],"enough":[53],"detailed":[54],"LLMs":[57,83,232],"to":[58,96,109,134,156,189,203],"generate":[59,110],"accurate":[60,205],"region-specific":[61],"In":[63,160,172],"paper,":[65],"we":[66,150,164,176],"introduce":[67],"Self-RoI,":[68],"Model":[72],"with":[73,129],"multi-modal":[74,99,138,231],"self-perception":[75],"capabilities":[76],"RoI":[79,117,167,180,217],"captioning.":[80],"To":[81],"ensure":[82,204],"receive":[84],"more":[85],"precise":[86],"and":[87,169,198],"sufficient":[88],"information,":[89],"Self-RoI":[90,158,212],"incorporates":[91],"Implicit":[92,222],"Textual":[93,223],"Info.":[94,224],"Perception":[95,225],"construct":[97],"vision-language":[100],"information.":[101],"This":[102,123],"module":[103,188],"utilizes":[104],"simple":[106],"mapping":[107],"network":[108],"textual":[111,124,196],"about":[113],"basic":[114],"properties":[115],"vision-following":[119],"response":[120],"is":[126],"then":[127],"integrated":[128,228],"RoI's":[131],"representation":[133],"form":[135],"comprehensive":[137],"instruction":[139],"Given":[142],"limited":[144],"availability":[145],"RoI-captioning":[148],"data,":[149],"propose":[151],"two-stage":[153],"training":[154],"strategy":[155],"optimize":[157],"efficiently.":[159],"first":[162],"stage,":[163,175],"align":[165],"vision":[168],"caption":[170,206],"representations.":[171],"second":[174],"focus":[177],"on":[178],"vision-caption":[181],"interaction,":[182],"using":[183],"disparate":[185],"contrastive":[186],"embedding":[187],"improve":[190],"reliability":[192],"implicit":[195],"employing":[199],"language":[200],"modeling":[201],"loss":[202],"generation.":[207],"Our":[208],"experiments":[209],"demonstrate":[210],"that":[211],"significantly":[213],"outperforms":[214],"previous":[215],"captioning":[218],"models.":[219],"Moreover,":[220],"can":[226],"be":[227],"other":[230],"performance":[234],"enhancement.":[235],"We":[236],"will":[237],"make":[238],"our":[239],"code":[240],"available":[241],"further":[243],"research.":[244]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
