{"id":"https://openalex.org/W7134227183","doi":"https://doi.org/10.48550/arxiv.2603.06569","title":"Penguin-VL: Exploring the Efficiency Limits of VLM with LLM-based Vision Encoders","display_name":"Penguin-VL: Exploring the Efficiency Limits of VLM with LLM-based Vision Encoders","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7134227183","doi":"https://doi.org/10.48550/arxiv.2603.06569"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.06569","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128401817","display_name":"Boqiang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Boqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128546632","display_name":"Lei Ke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ke, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128458048","display_name":"Ruihan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ruihan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128418748","display_name":"Qi Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113084792","display_name":"Tianyuan Qu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Tianyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128548404","display_name":"Rossell Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Rossell","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128424997","display_name":"Dong Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Dong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128428579","display_name":"Leoweiliang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leoweiliang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5128401817"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9818999767303467,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9818999767303467,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0024999999441206455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.002099999925121665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7124999761581421},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6805999875068665},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5095000267028809},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.49410000443458557},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.448199987411499},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.41449999809265137},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.37470000982284546},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.361299991607666},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.3549000024795532}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7799000144004822},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7124999761581421},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6805999875068665},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6241000294685364},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6139000058174133},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5095000267028809},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.49410000443458557},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.448199987411499},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.41449999809265137},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.361299991607666},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3549000024795532},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.34880000352859497},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.3303999900817871},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.3294999897480011},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.3124000132083893},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.2989000082015991},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.2565000057220459},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2563999891281128},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.06569","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.06569","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.06569","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.06569","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.4753120243549347,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"},{"score":0.47523796558380127,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision":[0],"Language":[1],"Model":[2],"(VLM)":[3],"development":[4],"has":[5],"largely":[6],"relied":[7],"on":[8,15,51],"scaling":[9,182],"model":[10,181],"size,":[11],"which":[12],"hinders":[13],"deployment":[14],"compute-constrained":[16],"mobile":[17],"and":[18,24,38,73,85,126,135,150,161,201,210,223],"edge":[19],"devices":[20],"such":[21,155],"as":[22,111,156],"smartphones":[23],"robots.":[25],"In":[26],"this":[27,91],"work,":[28],"we":[29,93],"explore":[30],"the":[31,43,184],"performance":[32,140,226],"limits":[33],"of":[34,123,187],"compact":[35],"(e.g.,":[36,59,145],"2B":[37],"8B)":[39],"VLMs.":[40],"We":[41,61],"challenge":[42],"prevailing":[44],"practice":[45],"that":[46,76,108,175,192,204],"state-of-the-art":[47],"VLMs":[48,144,222],"must":[49],"rely":[50],"vision":[52,97],"encoders":[53],"initialized":[54,100],"via":[55],"massive":[56],"contrastive":[57,66,117],"pretraining":[58],"CLIP/SigLIP).":[60],"identify":[62],"an":[63],"objective":[64],"mismatch:":[65],"learning,":[67],"optimized":[68],"for":[69,82,129,207,220],"discrimination,":[70],"enforces":[71],"coarse":[72],"category-level":[74],"invariances":[75],"suppress":[77],"fine-grained":[78,199],"visual":[79,124,159,177],"cues":[80,203],"needed":[81],"dense":[83,208],"captioning":[84],"complex":[86,211],"VLM":[87],"reasoning.":[88,212],"To":[89],"address":[90],"issue,":[92],"present":[94],"Penguin-VL,":[95],"whose":[96],"encoder":[98],"is":[99,183],"from":[101],"a":[102,112,120,171,216],"text-only":[103],"LLM.":[104],"Our":[105,189],"experiments":[106],"reveal":[107],"Penguin-Encoder":[109,193],"serves":[110],"superior":[113],"alternative":[114,219],"to":[115,142],"traditional":[116],"pretraining,":[118],"unlocking":[119],"higher":[121],"degree":[122],"fidelity":[125],"data":[127],"efficiency":[128],"multimodal":[130],"understanding.":[131,164],"Across":[132],"various":[133],"image":[134],"video":[136,163],"benchmarks,":[137],"Penguin-VL":[138],"achieves":[139],"comparable":[141],"leading":[143],"Qwen3-VL)":[146],"in":[147,153,227],"mathematical":[148],"reasoning":[149],"surpasses":[151],"them":[152],"tasks":[154],"document":[157],"understanding,":[158],"knowledge,":[160],"multi-perspective":[162],"Notably,":[165],"these":[166],"gains":[167],"are":[168,205],"achieved":[169],"with":[170],"lightweight":[172],"architecture,":[173],"demonstrating":[174],"improved":[176],"representation":[178],"rather":[179],"than":[180],"primary":[185],"driver":[186],"performance.":[188],"ablations":[190],"show":[191],"consistently":[194],"outperforms":[195],"contrastive-pretrained":[196],"encoders,":[197],"preserving":[198],"spatial":[200],"temporal":[202],"critical":[206],"perception":[209],"This":[213],"makes":[214],"it":[215],"strong":[217],"drop-in":[218],"compute-efficient":[221],"enables":[224],"high":[225],"resource-constrained":[228],"settings.":[229],"Code:":[230],"https://github.com/tencent-ailab/Penguin-VL":[231]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-10T00:00:00"}
