{"id":"https://openalex.org/W7151590383","doi":"https://doi.org/10.48550/arxiv.2604.04722","title":"Don't Waste Bits! Adaptive KV-Cache Quantization for Lightweight On-Device LLMs","display_name":"Don't Waste Bits! Adaptive KV-Cache Quantization for Lightweight On-Device LLMs","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7151590383","doi":"https://doi.org/10.48550/arxiv.2604.04722"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04722","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04722","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04722","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133086377","display_name":"Sayed Pedram Haeri Boroujeni","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Boroujeni, Sayed Pedram Haeri","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082851711","display_name":"Niloufar Mehrabi","orcid":"https://orcid.org/0000-0001-6507-0243"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mehrabi, Niloufar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133065375","display_name":"Patrick Woods","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Woods, Patrick","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133075122","display_name":"Gabriel Hillesheim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hillesheim, Gabriel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133109963","display_name":"Abolfazl Razi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Razi, Abolfazl","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5133086377"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2304999977350235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2304999977350235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.188400000333786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.09939999878406525,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.6866000294685364},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6098999977111816},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5306000113487244},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5092999935150146},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4740000069141388},{"id":"https://openalex.org/keywords/edge-device","display_name":"Edge device","score":0.36500000953674316}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7638000249862671},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.6866000294685364},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6098999977111816},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5306000113487244},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5092999935150146},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4740000069141388},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.41499999165534973},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3709999918937683},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.36500000953674316},{"id":"https://openalex.org/C46900642","wikidata":"https://www.wikidata.org/wiki/Q2647","display_name":"Huffman coding","level":3,"score":0.34040001034736633},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.33880001306533813},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3375999927520752},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3172000050544739},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.27459999918937683},{"id":"https://openalex.org/C30390489","wikidata":"https://www.wikidata.org/wiki/Q4680748","display_name":"Adaptive memory","level":3,"score":0.267300009727478}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04722","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04722","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04722","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04722","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.4781671166419983}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"achieved":[5],"remarkable":[6],"progress":[7],"across":[8,178,184],"reasoning,":[9],"generation,":[10],"and":[11,20,35,49,107,126,129,157,167,170,192,227],"decision-making":[12],"tasks,":[13],"yet":[14],"deploying":[15],"them":[16,131],"on":[17,60,69,207],"mobile,":[18],"embedded,":[19],"edge":[21],"devices":[22],"remains":[23,228],"particularly":[24],"challenging.":[25],"On-device":[26],"LLM":[27,180],"inference":[28,177],"is":[29],"heavily":[30],"constrained":[31],"by":[32,82,215,224],"the":[33,39,200],"memory":[34,106,155],"bandwidth":[36],"overhead":[37],"of":[38,86,233],"key-value":[40],"(KV)":[41],"cache,":[42],"which":[43],"grows":[44],"linearly":[45],"with":[46,205],"context":[47],"length":[48],"often":[50],"dominates":[51],"decoding":[52,212],"cost.":[53],"Existing":[54],"KV-cache":[55,92],"quantization":[56,166],"schemes":[57],"typically":[58],"rely":[59],"fixed":[61],"precision":[62,141,151],"or":[63],"hand-crafted":[64],"heuristics,":[65],"thereby":[66],"wasting":[67],"bits":[68],"low-impact":[70],"tokens":[71],"while":[72,159],"over-compressing":[73],"informative":[74],"ones,":[75],"leading":[76],"to":[77,101,163,175,218],"avoidable":[78],"accuracy":[79,161,173,223],"degradation.":[80],"Inspired":[81],"Huffman":[83],"coding's":[84],"principle":[85],"variable-length":[87],"allocation,":[88],"we":[89],"propose":[90],"adaptive":[91,150],"quantization,":[93,221],"a":[94,133],"learned":[95],"policy":[96,152],"that":[97,137,195],"assigns":[98],"bit-width":[99],"proportional":[100],"token":[102,120],"importance,":[103],"minimizing":[104],"expected":[105],"latency":[108,158,213],"without":[109],"sacrificing":[110],"competitive":[111,172],"accuracy.":[112],"Our":[113],"framework":[114],"extracts":[115],"lightweight":[116],"token-level":[117],"features,":[118],"including":[119],"frequency,":[121],"quality":[122],"score,":[123],"attention":[124],"variance,":[125],"entropy-based":[127],"uncertainty,":[128],"feeds":[130],"into":[132],"compact":[134],"data-driven":[135],"controller":[136,197],"dynamically":[138],"selects":[139],"KV":[140,154,165,220],"from":[142],"{2-bit,":[143],"4-bit,":[144],"8-bit,":[145],"FP16}":[146],"during":[147],"decoding.":[148],"This":[149],"reduces":[153,211],"footprint":[156],"improving":[160],"compared":[162],"static":[164,219],"rule-based":[168],"baselines,":[169],"maintaining":[171],"close":[174],"FP16":[176,234],"standard":[179],"benchmarks.":[181],"Extensive":[182],"experiments":[183],"multiple":[185],"commonsense":[186],"reasoning":[187],"benchmarks":[188],"using":[189],"SmolLM-135M,":[190],"SmolLM-360M,":[191],"SmolLM-1.7B":[193],"demonstrate":[194],"our":[196,209],"consistently":[198],"improves":[199,222],"accuracy-latency":[201],"trade-off.":[202],"For":[203],"instance,":[204],"SmolLM-360M":[206],"HellaSwag,":[208],"method":[210],"(ms/token)":[214],"17.75%":[216],"relative":[217],"7.60":[225],"points,":[226],"within":[229],"only":[230],"0.30":[231],"points":[232],"inference.":[235]},"counts_by_year":[],"updated_date":"2026-04-08T06:07:18.267832","created_date":"2026-04-08T00:00:00"}
