{"id":"https://openalex.org/W4417439096","doi":"https://doi.org/10.1109/tpami.2025.3644851","title":"Boosting Multi-Modal Large Language Model With Enhanced Visual Features","display_name":"Boosting Multi-Modal Large Language Model With Enhanced Visual Features","publication_year":2025,"publication_date":"2025-12-17","ids":{"openalex":"https://openalex.org/W4417439096","doi":"https://doi.org/10.1109/tpami.2025.3644851","pmid":"https://pubmed.ncbi.nlm.nih.gov/41406256"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2025.3644851","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3644851","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073144816","display_name":"Yiwei Ma","orcid":"https://orcid.org/0000-0002-8744-3423"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yiwei Ma","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]},{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111335242","display_name":"Weihuang Lin","orcid":"https://orcid.org/0009-0006-3807-6969"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weihuang Lin","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]},{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073622954","display_name":"Zhibin Wang","orcid":"https://orcid.org/0000-0001-7618-7973"},"institutions":[{"id":"https://openalex.org/I4210111607","display_name":"InferVision (China)","ror":"https://ror.org/027h3dg90","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210111607"]},{"id":"https://openalex.org/I4210148110","display_name":"Tech Pro (China)","ror":"https://ror.org/04rttan30","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210148110"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhibin Wang","raw_affiliation_strings":["Inf Tech Company, Beijing, China","Inf Tech Company, China"],"affiliations":[{"raw_affiliation_string":"Inf Tech Company, Beijing, China","institution_ids":["https://openalex.org/I4210111607"]},{"raw_affiliation_string":"Inf Tech Company, China","institution_ids":["https://openalex.org/I4210148110"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084510895","display_name":"Jiayi Ji","orcid":"https://orcid.org/0000-0002-9956-6308"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiayi Ji","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]},{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059926864","display_name":"Xiaoshuai Sun","orcid":"https://orcid.org/0000-0003-3912-9306"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoshuai Sun","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]},{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100403129","display_name":"Weisi Lin","orcid":"https://orcid.org/0000-0001-9866-1947"},"institutions":[{"id":"https://openalex.org/I25846049","display_name":"National Tsing Hua University","ror":"https://ror.org/00zdnkx70","country_code":"TW","type":"education","lineage":["https://openalex.org/I25846049"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chia-Wen Lin","raw_affiliation_strings":["Department of Electrical Engineering, National Tsing Hua University, Hsinchu, Taiwan","Department of Electrical Engineering, National Tsing Hua University, Taiwan"],"affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering, National Tsing Hua University, Hsinchu, Taiwan","institution_ids":["https://openalex.org/I25846049"]},{"raw_affiliation_string":"Department of Electrical Engineering, National Tsing Hua University, Taiwan","institution_ids":["https://openalex.org/I25846049"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016080094","display_name":"Rongrong Ji","orcid":"https://orcid.org/0000-0001-9163-2932"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rongrong Ji","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]},{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, China","institution_ids":["https://openalex.org/I191208505"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5073144816"],"corresponding_institution_ids":["https://openalex.org/I191208505"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.44208636,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"48","issue":"4","first_page":"4524","last_page":"4538"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9699000120162964,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9699000120162964,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.005400000140070915,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.691100001335144},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6715999841690063},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4740000069141388},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4514000117778778},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.39959999918937683},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.38530001044273376},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.37720000743865967},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.35429999232292175}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8212000131607056},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.691100001335144},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6715999841690063},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6333000063896179},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4740000069141388},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4729999899864197},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4514000117778778},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.39959999918937683},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.38530001044273376},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.37720000743865967},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.35429999232292175},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3384000062942505},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.30720001459121704},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2971999943256378},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2897999882698059},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.27469998598098755},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2669000029563904},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2515000104904175}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2025.3644851","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3644851","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:41406256","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41406256","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W2090048052","https://openalex.org/W2307512708","https://openalex.org/W2565639579","https://openalex.org/W2567001798","https://openalex.org/W2752782242","https://openalex.org/W2896457183","https://openalex.org/W2963518342","https://openalex.org/W2979382951","https://openalex.org/W2983167311","https://openalex.org/W3120043490","https://openalex.org/W3144293453","https://openalex.org/W3167939936","https://openalex.org/W4285255856","https://openalex.org/W4313047887","https://openalex.org/W4385245566","https://openalex.org/W4389523832","https://openalex.org/W4390873312","https://openalex.org/W4390874575","https://openalex.org/W4402713111","https://openalex.org/W4402716330","https://openalex.org/W4402716381","https://openalex.org/W4402716477","https://openalex.org/W4402726948","https://openalex.org/W4402727764","https://openalex.org/W4402727885","https://openalex.org/W4402754023","https://openalex.org/W4403002096","https://openalex.org/W4403081466","https://openalex.org/W4403842540","https://openalex.org/W4404199570","https://openalex.org/W4404575065","https://openalex.org/W4406657538","https://openalex.org/W4409365911","https://openalex.org/W4411725552","https://openalex.org/W4415795395","https://openalex.org/W4416707474","https://openalex.org/W7114915739","https://openalex.org/W7133196460"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,15,63,76,197,217],"computer":[3],"vision":[4,120,174],"(CV)":[5],"and":[6,26,32,45,80,105,108,127,143,179,214],"large":[7,17],"language":[8,18],"models":[9,19],"(LLMs)":[10],"have":[11],"spurred":[12],"significant":[13,191],"interest":[14],"multi-modal":[16],"(MLLMs),":[20],"which":[21],"aim":[22],"to":[23,47,87,155],"integrate":[24],"visual":[25,61,93,134,139,202,211],"textual":[27],"modalities":[28],"for":[29,222],"enhanced":[30],"understanding":[31],"generation":[33],"tasks.":[34],"While":[35],"much":[36],"of":[37,60,92,182,201,209],"the":[38,57,90,100,106,119,133,199,207,220],"existing":[39],"research":[40],"focuses":[41],"on":[42,164],"optimizing":[43,210],"projectors":[44],"LLMs":[46],"improve":[48],"MLLM":[49,78,85],"performance,":[50],"a":[51,83],"critical":[52],"question":[53],"remains":[54],"underexplored:":[55],"Has":[56],"full":[58],"potential":[59,200],"features":[62,117,140],"MLLMs":[64],"been":[65],"realized?":[66],"To":[67],"address":[68],"this":[69],"question,":[70],"we":[71],"identify":[72],"two":[73,97],"key":[74],"limitations":[75],"current":[77],"architectures":[79],"propose":[81],"vMLLM,":[82],"vision-enhanced":[84],"designed":[86],"fully":[88],"leverage":[89],"capabilities":[91],"features.":[94,203],"vMLLM":[95,168,188],"introduces":[96],"novel":[98],"components:":[99],"Multi-level":[101],"Aggregation":[102],"Module":[103,111],"(MAM)":[104],"Intra-":[107],"inter-modal":[109,144],"Enhancement":[110],"(IEM).":[112],"The":[113,136],"MAM":[114],"aggregates":[115],"multi-layer":[116],"from":[118],"encoder,":[121],"capturing":[122],"both":[123],"high-level":[124],"semantic":[125],"information":[126,149],"low-level":[128],"spatial":[129],"details,":[130],"thereby":[131],"enriching":[132],"representation.":[135],"IEM":[137],"enhances":[138],"through":[141],"intra-":[142],"interactions,":[145],"effectively":[146],"suppressing":[147],"irrelevant":[148],"while":[150],"amplifying":[151],"task-relevant":[152],"features,":[153],"leading":[154],"more":[156,223],"robust":[157],"multimodal":[158,225],"understanding.":[159],"We":[160],"conduct":[161],"extensive":[162],"experiments":[163],"multiple":[165],"benchmarks,":[166],"evaluating":[167],"across":[169],"diverse":[170],"settings,":[171],"including":[172],"different":[173],"encoders,":[175],"training":[176],"dataset":[177],"scales,":[178],"varying":[180],"sizes":[181],"LLMs.":[183],"Our":[184],"results":[185],"demonstrate":[186],"that":[187],"consistently":[189],"achieves":[190],"performance":[192],"improvements,":[193],"validating":[194],"its":[195],"effectiveness":[196],"harnessing":[198],"These":[204],"findings":[205],"highlight":[206],"importance":[208],"feature":[212],"extraction":[213],"interaction":[215],"mechanisms":[216],"MLLMs,":[218],"paving":[219],"way":[221],"advanced":[224],"AI":[226],"systems..":[227]},"counts_by_year":[],"updated_date":"2026-03-09T07:00:12.390032","created_date":"2025-12-17T00:00:00"}
