{"id":"https://openalex.org/W4414197699","doi":"https://doi.org/10.1109/dac63849.2025.11132479","title":"ClusterKV: Manipulating LLM KV Cache in Semantic Space for Recallable Compression","display_name":"ClusterKV: Manipulating LLM KV Cache in Semantic Space for Recallable Compression","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414197699","doi":"https://doi.org/10.1109/dac63849.2025.11132479"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11132479","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132479","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026393301","display_name":"Guangda Liu","orcid":"https://orcid.org/0009-0007-6108-688X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Guangda Liu","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Computer Science","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107791120","display_name":"Chengwei Li","orcid":"https://orcid.org/0000-0001-6320-1713"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengwei Li","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Computer Science","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102267966","display_name":"Jieru Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jieru Zhao","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Computer Science","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438525","display_name":"Chengqi Zhang","orcid":"https://orcid.org/0000-0001-5715-7154"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenqi Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Computer Science","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039318240","display_name":"Minyi Guo","orcid":"https://orcid.org/0000-0003-0034-2302"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minyi Guo","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Computer Science"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Computer Science","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5026393301"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":4.7137,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.95111393,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9914000034332275,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.8033999800682068},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.6363999843597412},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5493999719619751},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.5264999866485596},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.4805999994277954},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.47360000014305115},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.46459999680519104},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4352000057697296},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.4325000047683716}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.875},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.8033999800682068},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.6363999843597412},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5774000287055969},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.5264999866485596},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.4805999994277954},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.47360000014305115},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.46459999680519104},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4352000057697296},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.4325000047683716},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.4212000072002411},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4115000069141388},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.41119998693466187},{"id":"https://openalex.org/C25536678","wikidata":"https://www.wikidata.org/wiki/Q5015977","display_name":"Cache invalidation","level":5,"score":0.41019999980926514},{"id":"https://openalex.org/C59687516","wikidata":"https://www.wikidata.org/wiki/Q5015938","display_name":"Cache-oblivious algorithm","level":5,"score":0.39879998564720154},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.3905999958515167},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.3792000114917755},{"id":"https://openalex.org/C167713795","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"Smart Cache","level":5,"score":0.37380000948905945},{"id":"https://openalex.org/C141917322","wikidata":"https://www.wikidata.org/wiki/Q1025017","display_name":"Cache coherence","level":5,"score":0.3619000017642975},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.31839999556541443},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.31690001487731934},{"id":"https://openalex.org/C190290938","wikidata":"https://www.wikidata.org/wiki/Q387015","display_name":"Trie","level":3,"score":0.3066999912261963},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29580000042915344},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2870999872684479},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C2781357197","wikidata":"https://www.wikidata.org/wiki/Q5757597","display_name":"High memory","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25360000133514404},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11132479","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132479","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W4310494058","https://openalex.org/W4394998532","https://openalex.org/W4401211590","https://openalex.org/W4401211704","https://openalex.org/W4402671766"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4,58],"been":[5],"widely":[6],"deployed":[7],"in":[8,172,179],"a":[9,155,168,175],"variety":[10],"of":[11,44,86,119],"applications,":[12],"and":[13,27,48,98,104,124,128,134,164,174,194],"the":[14,84,95,117],"context":[15,33,151],"length":[16],"is":[17,205],"rapidly":[18],"increasing":[19],"to":[20,52,63,158,167,183],"handle":[21],"tasks":[22,147],"such":[23],"as":[24],"long-document":[25],"QA":[26],"complex":[28],"logical":[29],"reasoning.":[30],"However,":[31],"long":[32],"poses":[34],"significant":[35],"challenges":[36],"for":[37,76,130],"inference":[38,201],"efficiency,":[39],"including":[40],"high":[41],"memory":[42,54],"costs":[43],"key-value":[45],"(KV)":[46],"cache":[47,62,108,162],"increased":[49],"latency":[50,173],"due":[51],"extensive":[53],"accesses.":[55],"Recent":[56],"works":[57],"proposed":[59],"compressing":[60],"KV":[61,107,161,186],"approximate":[64],"computation,":[65],"but":[66],"these":[67],"methods":[68],"either":[69],"evict":[70],"tokens":[71,82,115],"permanently,":[72],"never":[73],"recalling":[74],"them":[75],"later":[77],"inference,":[78],"or":[79,199],"recall":[80],"previous":[81],"at":[83,116,207],"granularity":[85,118],"pages":[87],"divided":[88],"by":[89],"textual":[90],"positions.":[91],"Both":[92],"approaches":[93],"degrade":[94],"model":[96,192],"accuracy":[97,143,193],"output":[99,195],"quality.":[100],"To":[101],"achieve":[102],"efficient":[103,126],"accurate":[105],"recallable":[106,185],"compression,":[109],"we":[110],"introduce":[111],"ClusterKV,":[112],"which":[113],"recalls":[114],"semantic":[120],"clusters.":[121],"We":[122],"design":[123],"implement":[125],"algorithms":[127],"systems":[129],"clustering,":[131],"selection,":[132],"indexing":[133],"caching.":[135],"Experiment":[136],"results":[137],"show":[138],"that":[139],"ClusterKV":[140,189],"attains":[141],"negligible":[142],"loss":[144],"across":[145],"various":[146],"with":[148],"32":[149],"k":[150,157,160],"lengths,":[152],"using":[153],"only":[154],"1":[156],"2":[159],"budget,":[163],"achieves":[165],"up":[166],"$2":[169],"\\times$":[170,177],"speedup":[171],"$2.5":[176],"improvement":[178],"decoding":[180],"throughput.":[181],"Compared":[182],"SoTA":[184],"compression":[187],"methods,":[188],"demonstrates":[190],"higher":[191],"quality,":[196],"while":[197],"maintaining":[198],"exceeding":[200],"efficiency.":[202],"Our":[203],"code":[204],"available":[206],"https://github.com/sjtu-zhao-lab/ClusterKV.":[208]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
