{"id":"https://openalex.org/W4411403450","doi":"https://doi.org/10.1145/3725338","title":"PQCache: Product Quantization-based KVCache for Long Context LLM Inference","display_name":"PQCache: Product Quantization-based KVCache for Long Context LLM Inference","publication_year":2025,"publication_date":"2025-06-17","ids":{"openalex":"https://openalex.org/W4411403450","doi":"https://doi.org/10.1145/3725338"},"language":"en","primary_location":{"id":"doi:10.1145/3725338","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3725338","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100462441","display_name":"Hailin Zhang","orcid":"https://orcid.org/0009-0000-4188-7742"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hailin Zhang","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103849105","display_name":"X. L. Ji","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaodong Ji","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032095710","display_name":"Yilin Chen","orcid":"https://orcid.org/0009-0005-7251-3475"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yilin Chen","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039254679","display_name":"Fangcheng Fu","orcid":"https://orcid.org/0000-0003-1658-0380"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fangcheng Fu","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015552951","display_name":"Xupeng Miao","orcid":"https://orcid.org/0000-0002-9371-8358"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xupeng Miao","raw_affiliation_strings":["Purdue University, West Lafayette, USA"],"affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059601307","display_name":"Xiaonan Nie","orcid":"https://orcid.org/0000-0001-6766-757X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaonan Nie","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035684435","display_name":"Weipeng Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weipeng Chen","raw_affiliation_strings":["Baichuan Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baichuan Inc., Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062357883","display_name":"Bin Cui","orcid":"https://orcid.org/0000-0003-1681-4677"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Cui","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5100462441"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":19.879,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.99146841,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"3","issue":"3","first_page":"1","last_page":"30"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9908000230789185,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7881680727005005},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5654473304748535},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5590077042579651},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5042773485183716},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.49770501255989075},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.48854097723960876},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.45954057574272156},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.4549429714679718},{"id":"https://openalex.org/keywords/computer-engineering","display_name":"Computer engineering","score":0.324264258146286},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.298004150390625},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2712637782096863},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.26171016693115234},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.1103898286819458},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08450737595558167}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7881680727005005},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5654473304748535},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5590077042579651},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5042773485183716},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.49770501255989075},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.48854097723960876},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.45954057574272156},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.4549429714679718},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.324264258146286},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.298004150390625},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2712637782096863},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26171016693115234},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.1103898286819458},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08450737595558167}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3725338","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3725338","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":52,"referenced_works":["https://openalex.org/W398859631","https://openalex.org/W2124509324","https://openalex.org/W2204555070","https://openalex.org/W2489529491","https://openalex.org/W2747329762","https://openalex.org/W2765113847","https://openalex.org/W2767718834","https://openalex.org/W2779483754","https://openalex.org/W2888482885","https://openalex.org/W2913446918","https://openalex.org/W2930957955","https://openalex.org/W2962814013","https://openalex.org/W2963284996","https://openalex.org/W2963469388","https://openalex.org/W2998702515","https://openalex.org/W3011493836","https://openalex.org/W3094858795","https://openalex.org/W3098486933","https://openalex.org/W3173455118","https://openalex.org/W3196481040","https://openalex.org/W3205803342","https://openalex.org/W4210352519","https://openalex.org/W4244796164","https://openalex.org/W4251828973","https://openalex.org/W4284685333","https://openalex.org/W4285108565","https://openalex.org/W4285355789","https://openalex.org/W4289533987","https://openalex.org/W4311209912","https://openalex.org/W4317212783","https://openalex.org/W4366492471","https://openalex.org/W4381610063","https://openalex.org/W4385227045","https://openalex.org/W4385567149","https://openalex.org/W4387321091","https://openalex.org/W4389519226","https://openalex.org/W4390263770","https://openalex.org/W4391032878","https://openalex.org/W4392453885","https://openalex.org/W4393180038","https://openalex.org/W4399175194","https://openalex.org/W4400893425","https://openalex.org/W4401176373","https://openalex.org/W4401812232","https://openalex.org/W4401834466","https://openalex.org/W4402665833","https://openalex.org/W4402671766","https://openalex.org/W4402692722","https://openalex.org/W4403579163","https://openalex.org/W6600020652","https://openalex.org/W6600561556","https://openalex.org/W6600725278"],"related_works":["https://openalex.org/W2595172197","https://openalex.org/W2084856301","https://openalex.org/W2127970246","https://openalex.org/W2885125400","https://openalex.org/W1989889224","https://openalex.org/W3204400881","https://openalex.org/W3214410901","https://openalex.org/W3204296682","https://openalex.org/W3183118997","https://openalex.org/W2917767146"],"abstract_inverted_index":{"As":[0],"the":[1,11,22,33,57,83,89,122,138,156],"field":[2],"of":[3,25,93,166],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"continues":[8],"to":[9,38,55,110,128,148],"evolve,":[10],"context":[12],"length":[13],"in":[14,53,64,70,82,204],"inference":[15],"is":[16],"steadily":[17],"growing.":[18],"Key-Value":[19],"Cache":[20],"(KVCache),":[21],"intermediate":[23],"representations":[24],"tokens":[26],"within":[27],"LLM":[28,133],"inference,":[29],"has":[30],"now":[31],"become":[32],"primary":[34],"memory":[35],"bottleneck":[36],"due":[37],"limited":[39],"GPU":[40],"memory.":[41],"Current":[42],"methods":[43,197],"selectively":[44],"determine":[45],"suitable":[46],"keys":[47,130],"and":[48,91,135,146,168,175,189,200,207],"values":[49],"for":[50,131,160],"self-attention":[51,161],"computation":[52,174],"LLMs":[54],"address":[56],"issue.":[58],"However,":[59],"they":[60],"either":[61],"fall":[62],"short":[63],"maintaining":[65,113],"model":[66,114],"quality":[67,115],"or":[68],"result":[69],"high":[71],"serving":[72,119],"latency.":[73,120],"Drawing":[74],"inspiration":[75],"from":[76],"advanced":[77],"embedding":[78,98],"retrieval":[79,92,99],"techniques":[80],"prevalent":[81],"data":[84],"management":[85],"community,":[86],"we":[87,125,142,170],"consider":[88],"storage":[90],"KVCache":[94],"as":[95],"a":[96],"typical":[97],"problem.":[100],"We":[101],"propose":[102],"PQCache":[103,185],",":[104],"which":[105],"employs":[106],"Product":[107],"Quantization":[108],"(PQ)":[109],"manage":[111],"KVCache,":[112],"while":[116],"ensuring":[117],"low":[118,201],"During":[121,137],"prefilling":[123,206],"phase,":[124,141],"apply":[126],"PQ":[127,144],"tokens'":[129],"each":[132],"layer":[134],"head.":[136],"autoregressive":[139],"decoding":[140],"use":[143],"codes":[145],"centroids":[147],"approximately":[149],"identify":[150],"important":[151],"preceding":[152],"tokens,":[153],"then":[154],"fetch":[155],"corresponding":[157],"key-value":[158],"pairs":[159],"computation.":[162],"Through":[163],"meticulous":[164],"design":[165],"overlapping":[167],"caching,":[169],"minimize":[171],"any":[172],"additional":[173],"communication":[176],"overhead":[177],"during":[178],"both":[179,187,205],"phases.":[180],"Extensive":[181],"experiments":[182],"demonstrate":[183],"that":[184],"achieves":[186],"effectiveness":[188],"efficiency,":[190],"with":[191],"4.60%":[192],"score":[193],"improvement":[194],"over":[195],"existing":[196],"on":[198],"InfiniteBench":[199],"system":[202],"latency":[203],"decoding.":[208]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":5}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
