{"id":"https://openalex.org/W7118189490","doi":"https://doi.org/10.1109/tpami.2026.3650761","title":"Beyond LLaVA-HD: Diving Into High-Resolution Multimodal Large Language Models","display_name":"Beyond LLaVA-HD: Diving Into High-Resolution Multimodal Large Language Models","publication_year":2026,"publication_date":"2026-01-05","ids":{"openalex":"https://openalex.org/W7118189490","doi":"https://doi.org/10.1109/tpami.2026.3650761","pmid":"https://pubmed.ncbi.nlm.nih.gov/41489967"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2026.3650761","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2026.3650761","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"YiFan Zhang","orcid":"https://orcid.org/0000-0002-6227-0183"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"YiFan Zhang","raw_affiliation_strings":["State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Center for Research on Intelligent Perception and Computing (CRIPAC), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-6227-0183","affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Center for Research on Intelligent Perception and Computing (CRIPAC), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121977289","display_name":"Qingsong Wen","orcid":null},"institutions":[{"id":"https://openalex.org/I174135032","display_name":"Bellevue College","ror":"https://ror.org/05gr4yv49","country_code":"US","type":"education","lineage":["https://openalex.org/I174135032"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qingsong Wen","raw_affiliation_strings":["Squirrel Ai Learning, Bellevue, WA, USA"],"raw_orcid":"https://orcid.org/0000-0003-4516-2524","affiliations":[{"raw_affiliation_string":"Squirrel Ai Learning, Bellevue, WA, USA","institution_ids":["https://openalex.org/I174135032"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014172220","display_name":"Chaoyou Fu","orcid":"https://orcid.org/0000-0002-0079-7668"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chaoyou Fu","raw_affiliation_strings":["Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-0079-7668","affiliations":[{"raw_affiliation_string":"Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121926855","display_name":"Kun Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I174135032","display_name":"Bellevue College","ror":"https://ror.org/05gr4yv49","country_code":"US","type":"education","lineage":["https://openalex.org/I174135032"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kun Wang","raw_affiliation_strings":["Squirrel Ai Learning, Bellevue, WA, USA"],"raw_orcid":"https://orcid.org/0000-0003-0602-169X","affiliations":[{"raw_affiliation_string":"Squirrel Ai Learning, Bellevue, WA, USA","institution_ids":["https://openalex.org/I174135032"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101440776","display_name":"Xue Wang","orcid":"https://orcid.org/0009-0004-2296-9688"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]},{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xue Wang","raw_affiliation_strings":["Alibaba Group, Bellevue, WA, USA"],"raw_orcid":"https://orcid.org/0009-0004-2296-9688","affiliations":[{"raw_affiliation_string":"Alibaba Group, Bellevue, WA, USA","institution_ids":["https://openalex.org/I4210095624","https://openalex.org/I4210108985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121913403","display_name":"Zhang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhang Zhang","raw_affiliation_strings":["State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Center for Research on Intelligent Perception and Computing (CRIPAC), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-9425-3065","affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Center for Research on Intelligent Perception and Computing (CRIPAC), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121922035","display_name":"Liang Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Wang","raw_affiliation_strings":["State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Center for Research on Intelligent Perception and Computing (CRIPAC), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5224-8647","affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Center for Research on Intelligent Perception and Computing (CRIPAC), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5121972161","display_name":"Rong Jin","orcid":null},"institutions":[{"id":"https://openalex.org/I4210099336","display_name":"Menlo School","ror":"https://ror.org/01240pn49","country_code":"US","type":"education","lineage":["https://openalex.org/I4210099336"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rong Jin","raw_affiliation_strings":["Meta, Menlo Park, CA, USA"],"raw_orcid":"https://orcid.org/0000-0002-8797-4646","affiliations":[{"raw_affiliation_string":"Meta, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I4210099336"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210112150"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02188686,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"48","issue":"5","first_page":"5493","last_page":"5504"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6802999973297119,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6802999973297119,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.09539999812841415,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.059700001031160355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.6215999722480774},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5454999804496765},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.5101000070571899},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4291999936103821},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.397599995136261},{"id":"https://openalex.org/keywords/image-compression","display_name":"Image compression","score":0.35420000553131104},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3375999927520752},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3312000036239624}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.789900004863739},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.6215999722480774},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.588100016117096},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5454999804496765},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4291999936103821},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4221000075340271},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.397599995136261},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.35589998960494995},{"id":"https://openalex.org/C13481523","wikidata":"https://www.wikidata.org/wiki/Q412438","display_name":"Image compression","level":4,"score":0.35420000553131104},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3375999927520752},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3312000036239624},{"id":"https://openalex.org/C205372480","wikidata":"https://www.wikidata.org/wiki/Q210521","display_name":"Image resolution","level":2,"score":0.3296000063419342},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.290800005197525},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.2632000148296356},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26249998807907104},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2563999891281128}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2026.3650761","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2026.3650761","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:41489967","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41489967","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1627056226","display_name":null,"funder_award_id":"62373355","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4127089600","display_name":null,"funder_award_id":"62141608","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5156147101","display_name":null,"funder_award_id":"62236010","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7461287619","display_name":null,"funder_award_id":"L252033","funder_id":"https://openalex.org/F4320322919","funder_display_name":"Natural Science Foundation of Beijing Municipality"},{"id":"https://openalex.org/G8053739178","display_name":null,"funder_award_id":"62322607","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322919","display_name":"Natural Science Foundation of Beijing Municipality","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Seeing":[0],"clearly":[1],"with":[2,44,236,255,273],"high":[3,237],"resolution":[4,32,57,65],"is":[5,170],"a":[6,30,98,118,161,168,187,233],"foundation":[7],"of":[8,39,79,120,199,245,263],"Multimodal":[9],"Large":[10],"Language":[11],"Models":[12],"(MLLMs),":[13],"which":[14],"has":[15],"been":[16],"proven":[17],"to":[18,54,135,143,152,183],"be":[19,157],"vital":[20],"for":[21,216,239],"visual":[22],"perception":[23],"and":[24,41,75,96,204,226,261],"reasoning.":[25],"Existing":[26],"works":[27],"usually":[28],"employ":[29],"straightforward":[31],"upscaling":[33],"method,":[34,252],"where":[35,173],"the":[36,45,48,55,59,77,85,94,114,124,147,153,192,200,243,246],"image":[37,50,81,145,180,240,259],"consists":[38],"global":[40,86,115,201,225,264],"local":[42,68,80,136,179,205,227,247],"branches,":[43],"latter":[46],"being":[47],"sliced":[49],"patches":[51],"but":[52,176],"resized":[53],"same":[56],"as":[58,101,103,195],"former.":[60],"This":[61],"means":[62],"that":[63,126],"higher":[64],"requires":[66],"more":[67,177],"patches,":[69,137],"resulting":[70],"in":[71,191],"exorbitant":[72],"computational":[73],"expenses,":[74],"meanwhile,":[76],"dominance":[78],"tokens":[82,149,181],"may":[83],"diminish":[84],"context.":[87],"In":[88],"this":[89],"paper,":[90],"we":[91,109,230],"dive":[92],"into":[93],"problems":[95],"propose":[97],"new":[99],"framework":[100],"well":[102],"an":[104,217],"elaborate":[105],"optimization":[106],"strategy.":[107],"Specifically,":[108],"extract":[110],"contextual":[111],"information":[112],"from":[113],"view":[116],"using":[117],"mixture":[119],"adapters,":[121],"based":[122],"on":[123],"observation":[125],"different":[127,131],"adapters":[128],"excel":[129],"at":[130],"tasks.":[132],"With":[133],"regard":[134],"learnable":[138],"query":[139],"embeddings":[140],"are":[141],"introduced":[142],"reduce":[144],"tokens,":[146],"important":[148],"most":[150],"relevant":[151],"user":[154],"question":[155],"will":[156],"further":[158],"selected":[159],"by":[160],"similarity-based":[162],"selector.":[163],"Our":[164],"empirical":[165],"results":[166],"demonstrate":[167],"'less":[169],"more'":[171],"pattern,":[172],"utilizing":[174],"fewer":[175],"informative":[178],"leads":[182],"improved":[184],"performance.":[185],"Besides,":[186],"significant":[188],"challenge":[189],"lies":[190],"training":[193,198,219,244,277],"strategy,":[194],"simultaneous":[196],"end-to-end":[197],"mining":[202],"block":[203,207],"compression":[206,248],"does":[208],"not":[209],"yield":[210],"optimal":[211],"results.":[212],"We":[213],"thus":[214],"advocate":[215],"alternating":[218],"way,":[220],"ensuring":[221],"balanced":[222],"learning":[223],"between":[224],"aspects.":[228],"Finally,":[229],"also":[231],"introduce":[232],"challenging":[234],"dataset":[235],"requirements":[238],"detail,":[241],"enhancing":[242],"layer.":[249],"The":[250],"proposed":[251],"termed":[253],"MLLM":[254],"Sophisticated":[256],"Tasks,":[257],"Local":[258],"compression,":[260],"Mixture":[262],"Experts":[265],"(SliME),":[266],"achieves":[267],"leading":[268],"performance":[269],"across":[270],"various":[271],"benchmarks":[272],"only":[274],"2":[275],"million":[276],"data.":[278]},"counts_by_year":[],"updated_date":"2026-04-04T06:10:10.580331","created_date":"2026-01-05T00:00:00"}
