{"id":"https://openalex.org/W4392363977","doi":"https://doi.org/10.48550/arxiv.2402.18096","title":"No Token Left Behind: Reliable KV Cache Compression via Importance-Aware Mixed Precision Quantization","display_name":"No Token Left Behind: Reliable KV Cache Compression via Importance-Aware Mixed Precision Quantization","publication_year":2024,"publication_date":"2024-02-28","ids":{"openalex":"https://openalex.org/W4392363977","doi":"https://doi.org/10.48550/arxiv.2402.18096"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2402.18096","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.18096","pdf_url":"https://arxiv.org/pdf/2402.18096","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2402.18096","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039524786","display_name":"June Yong Yang","orcid":"https://orcid.org/0009-0009-8799-889X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yang, June Yong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031724891","display_name":"Byeongwook Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Byeongwook","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087228701","display_name":"Jeongin Bae","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bae, Jeongin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015122589","display_name":"Beomseok Kwon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kwon, Beomseok","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011127697","display_name":"Gunho Park","orcid":"https://orcid.org/0000-0002-8078-4356"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Gunho","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086698569","display_name":"Eunho Yang","orcid":"https://orcid.org/0000-0003-2188-0169"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Eunho","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055490416","display_name":"Se Jung Kwon","orcid":"https://orcid.org/0000-0003-3456-9038"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kwon, Se Jung","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101652725","display_name":"Dongsoo Lee","orcid":"https://orcid.org/0000-0002-4155-6940"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Dongsoo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5039524786"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9746999740600586,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10052","display_name":"Medical Image Segmentation Techniques","score":0.967199981212616,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7668077945709229},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6055286526679993},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.579505443572998},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.5635836720466614},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.43538498878479004},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.26973024010658264},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.21911337971687317},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.18963801860809326},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.11376366019248962},{"id":"https://openalex.org/keywords/composite-material","display_name":"Composite material","score":0.04710566997528076}],"concepts":[{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7668077945709229},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6055286526679993},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.579505443572998},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.5635836720466614},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.43538498878479004},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26973024010658264},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.21911337971687317},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.18963801860809326},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.11376366019248962},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.04710566997528076}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2402.18096","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.18096","pdf_url":"https://arxiv.org/pdf/2402.18096","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"doi:10.48550/arxiv.2402.18096","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2402.18096","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2402.18096","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.18096","pdf_url":"https://arxiv.org/pdf/2402.18096","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700","https://openalex.org/W4307309205","https://openalex.org/W4288261899","https://openalex.org/W2997152889","https://openalex.org/W2357388125"],"abstract_inverted_index":{"Key-Value":[0],"(KV)":[1],"Caching":[2],"has":[3],"become":[4],"an":[5],"essential":[6],"technique":[7],"for":[8],"accelerating":[9],"the":[10,21,25,36,49,52,68,74,80,94,107,111,138,148,152,158,172,192,197,209],"inference":[11],"speed":[12],"and":[13,43,62,100,122,203,219,234],"throughput":[14],"of":[15,24,51,77,97,134],"generative":[16,81],"Large":[17],"Language":[18],"Models~(LLMs).":[19],"However,":[20],"memory":[22,72],"footprint":[23],"KV":[26,65,112,140,160,182,199,211],"cache":[27,37,69,98,186],"poses":[28],"a":[29,131,166,184,228],"critical":[30],"bottleneck":[31],"in":[32,110,118,137,201,213],"LLM":[33,220],"deployment":[34],"as":[35,106],"size":[38,42,50],"grows":[39],"with":[40],"batch":[41],"sequence":[44],"length,":[45],"often":[46],"surpassing":[47],"even":[48,130],"model":[53],"itself.":[54],"Although":[55],"recent":[56],"methods":[57],"were":[58],"proposed":[59,225],"to":[60,70,85,170,237],"select":[61],"evict":[63],"unimportant":[64],"pairs":[66,113,141,161,200,212],"from":[67],"reduce":[71],"consumption,":[73],"potential":[75],"ramifications":[76],"eviction":[78,99],"on":[79,216],"process":[82],"are":[83],"yet":[84],"be":[86,163],"thoroughly":[87],"examined.":[88],"In":[89],"this":[90],"paper,":[91],"we":[92,126,155,179],"examine":[93],"detrimental":[95],"impact":[96],"observe":[101,156],"that":[102,128,157,189,223],"unforeseen":[103],"risks":[104],"arise":[105],"information":[108,135],"contained":[109,136],"is":[114],"exhaustively":[115],"discarded,":[116],"resulting":[117],"safety":[119],"breaches,":[120],"hallucinations,":[121],"context":[123,193],"loss.":[124],"Surprisingly,":[125],"find":[127],"preserving":[129],"small":[132],"amount":[133],"evicted":[139,198],"via":[142],"reduced":[143],"precision":[144,169],"quantization":[145],"substantially":[146],"recovers":[147],"incurred":[149],"degradation.":[150],"On":[151],"other":[153,238],"hand,":[154],"important":[159,210],"must":[162],"kept":[164],"at":[165],"relatively":[167],"higher":[168],"safeguard":[171],"generation":[173,205],"quality.":[174],"Motivated":[175],"by":[176,195,207],"these":[177],"observations,":[178],"propose":[180],"\\textit{Mixed-precision":[181],"cache}~(MiKV),":[183],"reliable":[185],"compression":[187,232],"method":[188,226],"simultaneously":[190],"preserves":[191],"details":[194],"retaining":[196],"low-precision":[202],"ensure":[204],"quality":[206],"keeping":[208],"high-precision.":[214],"Experiments":[215],"diverse":[217],"benchmarks":[218],"backbones":[221],"show":[222],"our":[224],"offers":[227],"state-of-the-art":[229],"trade-off":[230],"between":[231],"ratio":[233],"performance,":[235],"compared":[236],"baselines.":[239]},"counts_by_year":[],"updated_date":"2026-03-14T08:43:22.919905","created_date":"2024-03-05T00:00:00"}
