{"id":"https://openalex.org/W7161684918","doi":"https://doi.org/10.48550/arxiv.2605.16439","title":"KVCapsule: Efficient Sequential KV Cache Compression for Vision-Language Models with Asymmetric Redundancy","display_name":"KVCapsule: Efficient Sequential KV Cache Compression for Vision-Language Models with Asymmetric Redundancy","publication_year":2026,"publication_date":"2026-05-14","ids":{"openalex":"https://openalex.org/W7161684918","doi":"https://doi.org/10.48550/arxiv.2605.16439"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.16439","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16439","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.16439","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114058811","display_name":"Yingbing Huang","orcid":"https://orcid.org/0009-0004-8244-2917"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yingbing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055690325","display_name":"Tharun Adithya Srikrishnan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Srikrishnan, Tharun Adithya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109010789","display_name":"Steven K. Reinhardt","orcid":"https://orcid.org/0000-0002-2479-0030"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Reinhardt, Steven K.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136475028","display_name":"Deming Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Deming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8356000185012817,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8356000185012817,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.018300000578165054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.01810000091791153,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7475000023841858},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.6191999912261963},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.511900007724762},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.45910000801086426},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.44209998846054077},{"id":"https://openalex.org/keywords/image-compression","display_name":"Image compression","score":0.4327999949455261},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4296000003814697},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4277999997138977},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4235999882221222}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8621000051498413},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7475000023841858},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.6191999912261963},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.511900007724762},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.45910000801086426},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.44209998846054077},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.43880000710487366},{"id":"https://openalex.org/C13481523","wikidata":"https://www.wikidata.org/wiki/Q412438","display_name":"Image compression","level":4,"score":0.4327999949455261},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4296000003814697},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4277999997138977},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4235999882221222},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.4081999957561493},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.39070001244544983},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.3815999925136566},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C3720319","wikidata":"https://www.wikidata.org/wiki/Q5015937","display_name":"Cache-only memory architecture","level":5,"score":0.36010000109672546},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.35030001401901245},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3474999964237213},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.33889999985694885},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3334999978542328},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.3237999975681305},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.296999990940094},{"id":"https://openalex.org/C27602214","wikidata":"https://www.wikidata.org/wiki/Q1868547","display_name":"Locality of reference","level":3,"score":0.28130000829696655},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2700999975204468},{"id":"https://openalex.org/C68767595","wikidata":"https://www.wikidata.org/wiki/Q1677999","display_name":"Contiguity","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26460000872612},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2614000141620636}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.16439","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16439","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.16439","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16439","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1,14],"(VLMs)":[2],"have":[3],"emerged":[4],"as":[5],"a":[6,107,132,193],"critical":[7,119],"and":[8,23,37,68,78,156,166,175,185,218],"fast-growing":[9],"extension":[10],"of":[11,31,81,111,114],"Large":[12],"Language":[13],"(LLMs)":[15],"that":[16,88],"enable":[17],"multimodal":[18,227],"reasoning":[19],"through":[20,163],"both":[21],"text":[22],"image":[24],"inputs.":[25],"Although":[26],"VLMs":[27,162,174],"enrich":[28],"the":[29,42,47,76,112,118,143,152],"capabilities":[30],"language":[32],"models,":[33],"they":[34],"also":[35],"inherit":[36],"amplify":[38],"key":[39],"computational":[40],"bottlenecks:":[41],"memory":[43,191,216],"overhead":[44],"caused":[45],"by":[46],"large":[48],"key-value":[49],"(KV)":[50],"cache":[51,93,135,190,224],"during":[52],"autoregressive":[53],"decoding.":[54],"This":[55],"challenge":[56],"is":[57],"particularly":[58],"severe":[59],"in":[60,183,188,200],"VLMs,":[61],"where":[62],"images":[63],"produce":[64],"longer":[65],"token":[66],"sequences":[67],"denser":[69],"feature":[70],"representations":[71],"compared":[72],"to":[73,100,151,180,210],"text.":[74],"Moreover,":[75],"spatial":[77],"information-rich":[79],"nature":[80],"vision":[82,115,139],"tokens":[83],"introduces":[84],"structured":[85],"attention":[86,153],"patterns":[87],"make":[89],"many":[90],"LLM-oriented":[91],"KV":[92,134,189],"compression":[94,136,165,195,225],"techniques":[95],"ineffective":[96],"when":[97],"applied":[98],"directly":[99],"VLMs.":[101],"In":[102],"this":[103],"work,":[104],"we":[105,129],"conduct":[106],"detailed":[108],"empirical":[109],"analysis":[110],"behavior":[113],"tokens,":[116],"highlighting":[117],"differences":[120],"from":[121],"purely":[122],"text-based":[123],"models.":[124,228],"Based":[125],"on":[126,172],"these":[127],"insights,":[128],"propose":[130],"KVCapsule,":[131],"novel":[133],"framework":[137],"for":[138,226],"tokens.":[140],"KVCapsule":[141,171],"keeps":[142],"pretrained":[144],"VLM":[145,212],"backbone":[146],"frozen,":[147],"requires":[148],"no":[149],"modification":[150],"computation":[154],"modules,":[155],"can":[157],"be":[158],"integrated":[159],"into":[160,222],"existing":[161],"lightweight":[164],"reconstruction":[167],"components.":[168],"We":[169],"evaluate":[170],"multiple":[173],"benchmark":[176],"tasks,":[177],"demonstrating":[178],"up":[179],"2x":[181],"improvement":[182],"TPS":[184],"2.4x":[186],"reduction":[187],"at":[192],"60%":[194],"ratio,":[196],"with":[197],"negligible":[198],"degradation":[199],"accuracy":[201],"or":[202],"response":[203],"quality.":[204],"Our":[205],"findings":[206],"offer":[207],"practical":[208],"pathways":[209],"scale":[211],"inference":[213],"under":[214],"constrained":[215],"budgets":[217],"inspire":[219],"further":[220],"research":[221],"structure-aware":[223]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
