{"id":"https://openalex.org/W4416707474","doi":"https://doi.org/10.1109/tpami.2025.3637265","title":"Mini-Gemini: Mining the Potential of Multi-Modality Vision Language Models","display_name":"Mini-Gemini: Mining the Potential of Multi-Modality Vision Language Models","publication_year":2025,"publication_date":"2025-11-26","ids":{"openalex":"https://openalex.org/W4416707474","doi":"https://doi.org/10.1109/tpami.2025.3637265","pmid":"https://pubmed.ncbi.nlm.nih.gov/41296951"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2025.3637265","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3637265","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100727824","display_name":"Yanwei Li","orcid":"https://orcid.org/0000-0002-2736-132X"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Yanwei Li","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009874821","display_name":"Yuechen Zhang","orcid":"https://orcid.org/0009-0000-9112-0216"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yuechen Zhang","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103090275","display_name":"Chengyao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chengyao Wang","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhisheng Zhong","orcid":"https://orcid.org/0009-0002-7831-8636"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Zhisheng Zhong","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100393444","display_name":"Yixin Chen","orcid":"https://orcid.org/0000-0002-2939-2541"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yixin Chen","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034923822","display_name":"Ruihang Chu","orcid":"https://orcid.org/0000-0001-9057-745X"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Ruihang Chu","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083783069","display_name":"Shaoteng Liu","orcid":"https://orcid.org/0000-0001-5407-2905"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Shaoteng Liu","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kon","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052856441","display_name":"Jiaya Jia","orcid":"https://orcid.org/0000-0002-1246-553X"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jiaya Jia","raw_affiliation_strings":["Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, Hong Kong","Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, Hong Kon"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, Hong Kong","institution_ids":["https://openalex.org/I200769079"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, Hong Kon","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5100727824"],"corresponding_institution_ids":["https://openalex.org/I177725633"],"apc_list":null,"apc_paid":null,"fwci":19.4892,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.99470856,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"48","issue":"3","first_page":"3530","last_page":"3543"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9922999739646912,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9922999739646912,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.000699999975040555,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6146000027656555},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5742999911308289},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5620999932289124},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.49380001425743103},{"id":"https://openalex.org/keywords/scope","display_name":"Scope (computer science)","score":0.47780001163482666},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4672999978065491},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.46720001101493835},{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.4643000066280365},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.400299996137619}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8500000238418579},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6348000168800354},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6146000027656555},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5742999911308289},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5620999932289124},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.49380001425743103},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.48500001430511475},{"id":"https://openalex.org/C2778012447","wikidata":"https://www.wikidata.org/wiki/Q1034415","display_name":"Scope (computer science)","level":2,"score":0.47780001163482666},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4672999978065491},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.46720001101493835},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.4643000066280365},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.400299996137619},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3968999981880188},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3560999929904938},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.3449000120162964},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.320499986410141},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.31610000133514404},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3091999888420105},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C190954187","wikidata":"https://www.wikidata.org/wiki/Q5270587","display_name":"Dialog system","level":3,"score":0.28049999475479126},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2551000118255615},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.25290000438690186},{"id":"https://openalex.org/C105842133","wikidata":"https://www.wikidata.org/wiki/Q1899679","display_name":"Visual communication","level":2,"score":0.25049999356269836},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tpami.2025.3637265","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3637265","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:41296951","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41296951","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-167179","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-167179","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0,109,153],"this":[1],"work,":[2],"we":[3,115],"introduce":[4],"Mini-Gemini,":[5],"a":[6,28,42,135,177],"simple":[7],"and":[8,26,38,86,103,143,162,170,181,199,221,235,254],"effective":[9],"framework":[10],"enhancing":[11],"multi-modality":[12],"Vision":[13],"Language":[14,184],"Models":[15,185],"(VLMs).":[16],"Despite":[17],"the":[18,47,51,64,78,98,128,147,158,202,214,238],"advancements":[19],"in":[20,195,231,256],"VLMs":[21,54,102,161],"facilitating":[22],"basic":[23],"visual":[24,71,113,121,129,252],"dialog":[25],"reasoning,":[27,169],"performance":[29,57,194,230],"gap":[30,48,99],"persists":[31],"compared":[32],"to":[33,45,83,96,111,117,189,209,246],"advanced":[34],"models":[35,105],"like":[36],"GPT-4":[37],"Gemini.":[39],"We":[40,132],"propose":[41,116],"novel":[43],"approach":[44],"narrow":[46],"by":[49],"mining":[50],"potential":[52,159],"of":[53,89,150,160,179],"for":[55,123],"better":[56],"across":[58],"various":[59],"cross-modal":[60],"tasks.":[61],"It":[62,206],"tackles":[63],"following":[65],"questions:":[66],"(1)":[67],"How":[68,82,95],"can":[69],"high-resolution":[70,124],"tokens":[72],"improve":[73,84,247],"image":[74,141,167],"understanding":[75],"without":[76,126],"lengthening":[77],"token":[79,130],"sequence?":[80],"(2)":[81],"reasoning":[85],"generation":[87,171],"abilities":[88],"VLM":[90],"with":[91,166,249],"high-quality":[92,136],"data?":[93],"(3)":[94],"close":[97],"between":[100],"open-source":[101],"proprietary":[104],"on":[106,213,223],"reasoning-driven":[107],"generation?":[108],"particular,":[110],"enhance":[112],"tokens,":[114],"utilize":[118],"an":[119],"additional":[120],"encoder":[122],"refinement":[125],"increasing":[127],"count.":[131],"further":[133,156],"construct":[134],"dataset":[137],"that":[138],"promotes":[139],"precise":[140],"comprehension":[142],"reasoning-based":[144],"generation,":[145],"expanding":[146],"operational":[148],"scope":[149],"current":[151,164],"VLMs.":[152],"general,":[154],"Mini-Gemini":[155,243],"mines":[157],"empowers":[163],"frameworks":[165],"understanding,":[168],"simultaneously.":[172],"The":[173],"proposed":[174],"model":[175],"supports":[176],"series":[178],"dense":[180],"MoE":[182],"Large":[183],"(LLMs)":[186],"from":[187],"2B":[188],"34B,":[190],"which":[191],"achieve":[192],"leading":[193,229],"several":[196,232],"zero-shot":[197,233],"benchmarks":[198,234],"even":[200,236],"surpasses":[201,237],"developed":[203,239],"private":[204,240],"models.":[205,241],"is":[207,244],"demonstrated":[208],"attain":[210],"80.6%":[211],"accuracy":[212],"MMB":[215],"benchmark":[216],"(+5.4":[217],"vs":[218,226],"Gemini":[219],"Pro)":[220],"74.1%":[222],"TextVQA":[224],"(+4.6":[225],"LLaVA-NeXT),":[227],"achieving":[228],"Furthermore,":[242],"proven":[245],"consistently":[248],"stronger":[250],"LLM,":[251],"encoder,":[253],"data":[255],"experiments.":[257]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":12}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-27T00:00:00"}
