{"id":"https://openalex.org/W7140133911","doi":"https://doi.org/10.48550/arxiv.2603.19664","title":"The Residual Stream Is All You Need: On the Redundancy of the KV Cache in Transformer Inference","display_name":"The Residual Stream Is All You Need: On the Redundancy of the KV Cache in Transformer Inference","publication_year":2026,"publication_date":"2026-03-20","ids":{"openalex":"https://openalex.org/W7140133911","doi":"https://doi.org/10.48550/arxiv.2603.19664"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.19664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.19664","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130385844","display_name":"Kaleem Ullah Qasim","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qasim, Kaleem Ullah","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059626844","display_name":"Jiashu Zhang","orcid":"https://orcid.org/0000-0003-2875-3293"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiashu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130409841","display_name":"Muhammad Kafeel Shaheen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shaheen, Muhammad Kafeel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093920864","display_name":"Razan Alharith","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alharith, Razan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5063311758","display_name":"Heying Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Heying","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5130385844"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6672999858856201,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6672999858856201,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.08749999850988388,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.036400001496076584,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7584999799728394},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7289000153541565},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.7254999876022339},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.47769999504089355},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.36160001158714294},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.3199999928474426}],"concepts":[{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7584999799728394},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7289000153541565},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.7254999876022339},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.694599986076355},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5515999794006348},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.47769999504089355},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4009999930858612},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.36160001158714294},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3199999928474426},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.30079999566078186},{"id":"https://openalex.org/C98763669","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov chain","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.2786000072956085},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C25536678","wikidata":"https://www.wikidata.org/wiki/Q5015977","display_name":"Cache invalidation","level":5,"score":0.2614000141620636},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2556000053882599}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.19664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.19664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"key-value":[1],"(KV)":[2],"cache":[3,115,180,201],"is":[4,33,108,228],"widely":[5],"treated":[6],"as":[7],"essential":[8],"state":[9,32],"in":[10],"transformer":[11],"inference,":[12],"and":[13,37,49,94,107,117,162],"a":[14,53,104,138],"large":[15],"body":[16],"of":[17,45,154],"work":[18],"engineers":[19],"policies":[20],"to":[21,79,206,216],"compress,":[22],"evict,":[23],"or":[24],"approximate":[25],"its":[26],"entries.":[27],"We":[28,68,131],"prove":[29],"that":[30,99,142],"this":[31,70,134],"entirely":[34,116],"redundant:":[35],"keys":[36,161],"values":[38,163],"at":[39,85,174,199,223,230],"every":[40,86,200],"layer":[41,87],"are":[42],"deterministic":[43],"projections":[44],"the":[46,100,109,114,178],"residual":[47,55,83,101,144],"stream,":[48],"recomputing":[50,118,160],"them":[51],"from":[52,74,119],"single":[54],"vector":[56],"per":[57,148],"token":[58,149,197],"incurs":[59],"exactly":[60],"zero":[61],"reconstruction":[62],"error,":[63],"not":[64],"approximately,":[65],"but":[66],"bit-identically.":[67],"verify":[69],"across":[71],"six":[72],"models":[73,129],"four":[75],"architecture":[76],"families":[77],"(135M":[78],"4B":[80],"parameters).":[81],"Cross-task":[82],"patching":[84],"produces":[88],"D_KL":[89],"=":[90],"0":[91],"between":[92],"patched":[93],"original":[95],"output":[96,123],"distributions,":[97],"confirming":[98],"stream":[102],"satisfies":[103],"Markov":[105],"property":[106],"sole":[110],"information-carrying":[111],"state.":[112],"Removing":[113],"scratch":[120],"yields":[121],"token-identical":[122],"under":[124],"greedy":[125],"decoding":[126],"on":[127,133,150,164],"all":[128,203],"tested.":[130],"build":[132],"result":[135],"with":[136],"KV-Direct,":[137],"bounded-memory":[139],"inference":[140],"scheme":[141],"checkpoints":[143],"vectors":[145],"(5":[146],"KB":[147],"Gemma":[151],"3-4B)":[152],"instead":[153],"full":[155],"KV":[156],"pairs":[157],"(136":[158],"KB),":[159],"demand.":[165],"Over":[166],"20":[167],"conversation":[168],"turns,":[169],"KV-Direct":[170,194],"holds":[171],"peak":[172],"memory":[173],"42":[175],"MB":[176],"while":[177],"standard":[179],"grows":[181],"past":[182],"103":[183],"MB.":[184],"Against":[185],"five":[186],"eviction":[187],"baselines":[188,204],"(H2O,":[189],"StreamingLLM,":[190],"SnapKV,":[191],"TOVA,":[192],"window-only),":[193],"maintains":[195],"100%":[196],"match":[198],"budget;":[202],"degrade":[205],"5-28%.":[207],"A":[208],"per-operation":[209],"latency":[210],"analysis":[211],"shows":[212],"recomputation":[213],"runs":[214],"up":[215],"5x":[217],"faster":[218],"than":[219],"reading":[220],"cached":[221],"tensors":[222],"moderate":[224],"batch":[225],"sizes.":[226],"Code":[227],"available":[229],"https://github.com/Kaleemullahqasim/KV-Direct.":[231]},"counts_by_year":[],"updated_date":"2026-03-24T06:04:31.470712","created_date":"2026-03-24T00:00:00"}
