{"id":"https://openalex.org/W4411487104","doi":"https://doi.org/10.1145/3695053.3731019","title":"Oaken: Fast and Efficient LLM Serving with Online-Offline Hybrid KV Cache Quantization","display_name":"Oaken: Fast and Efficient LLM Serving with Online-Offline Hybrid KV Cache Quantization","publication_year":2025,"publication_date":"2025-06-20","ids":{"openalex":"https://openalex.org/W4411487104","doi":"https://doi.org/10.1145/3695053.3731019"},"language":"en","primary_location":{"id":"doi:10.1145/3695053.3731019","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731019","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731019","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731019","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Minsu Kim","orcid":"https://orcid.org/0009-0003-8751-0352"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Minsu Kim","raw_affiliation_strings":["KAIST, Daejeon, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST, Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101128416","display_name":"Seongmin Hong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seongmin Hong","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046602398","display_name":"Ryeowook Ko","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"RyeoWook Ko","raw_affiliation_strings":["KAIST, Daejeon, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST, Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111252036","display_name":"Soongyu Choi","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Soongyu Choi","raw_affiliation_strings":["KAIST, Daejeon, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST, Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046228864","display_name":"Hunjong Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hunjong Lee","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100636164","display_name":"Junsoo Kim","orcid":"https://orcid.org/0000-0001-6680-2602"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junsoo Kim","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100447377","display_name":"Joo-Young Kim","orcid":"https://orcid.org/0000-0003-1099-1496"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joo-Young Kim","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5037553165","display_name":"Jongse Park","orcid":"https://orcid.org/0000-0002-6629-449X"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jongse Park","raw_affiliation_strings":["KAIST, Daejeon, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST, Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I157485424"],"apc_list":null,"apc_paid":null,"fwci":5.6644,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.9592716,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"482","last_page":"497"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9922999739646912,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9911999702453613,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.74821937084198},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6959773302078247},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.31603771448135376}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.74821937084198},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6959773302078247},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.31603771448135376}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3695053.3731019","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731019","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731019","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.18599","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.18599","pdf_url":"https://arxiv.org/pdf/2503.18599","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3695053.3731019","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731019","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731019","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3034753964","display_name":null,"funder_award_id":"grant","funder_id":"https://openalex.org/F4320320671","funder_display_name":"National Research Foundation"},{"id":"https://openalex.org/G342704958","display_name":null,"funder_award_id":"funded","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"},{"id":"https://openalex.org/G515828364","display_name":null,"funder_award_id":"No. RS-","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"},{"id":"https://openalex.org/G6072120315","display_name":null,"funder_award_id":"funded","funder_id":"https://openalex.org/F4320335489","funder_display_name":"Institute for Information and Communications Technology Promotion"},{"id":"https://openalex.org/G626505518","display_name":null,"funder_award_id":"No. 201","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"},{"id":"https://openalex.org/G79798675","display_name":null,"funder_award_id":"2018-0-00503","funder_id":"https://openalex.org/F4320335489","funder_display_name":"Institute for Information and Communications Technology Promotion"}],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"},{"id":"https://openalex.org/F4320322120","display_name":"National Research Foundation of Korea","ror":"https://ror.org/013aysd81"},{"id":"https://openalex.org/F4320328359","display_name":"Ministry of Science and ICT, South Korea","ror":"https://ror.org/01wpjm123"},{"id":"https://openalex.org/F4320335489","display_name":"Institute for Information and Communications Technology Promotion","ror":"https://ror.org/01g0hqq23"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411487104.pdf","grobid_xml":"https://content.openalex.org/works/W4411487104.grobid-xml"},"referenced_works_count":40,"referenced_works":["https://openalex.org/W2734941459","https://openalex.org/W2946609015","https://openalex.org/W2962821792","https://openalex.org/W2990138404","https://openalex.org/W3006732000","https://openalex.org/W3016832937","https://openalex.org/W3043504674","https://openalex.org/W3100985894","https://openalex.org/W3158020960","https://openalex.org/W3158275024","https://openalex.org/W3159727696","https://openalex.org/W4214734582","https://openalex.org/W4280496502","https://openalex.org/W4281660701","https://openalex.org/W4285507310","https://openalex.org/W4286571858","https://openalex.org/W4294031337","https://openalex.org/W4300865759","https://openalex.org/W4308083513","https://openalex.org/W4360831846","https://openalex.org/W4366341968","https://openalex.org/W4380874786","https://openalex.org/W4380881077","https://openalex.org/W4387064012","https://openalex.org/W4387321091","https://openalex.org/W4389476354","https://openalex.org/W4389518760","https://openalex.org/W4389521054","https://openalex.org/W4389523971","https://openalex.org/W4392427708","https://openalex.org/W4393147284","https://openalex.org/W4393152626","https://openalex.org/W4393160423","https://openalex.org/W4393407316","https://openalex.org/W4393578753","https://openalex.org/W4394998968","https://openalex.org/W4395073431","https://openalex.org/W4401211590","https://openalex.org/W4401211642","https://openalex.org/W4401211807"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Modern":[0],"Large":[1],"Language":[2],"Model":[3],"(LLM)":[4],"serving":[5],"system":[6],"batches":[7],"multiple":[8,37],"requests":[9],"to":[10,26,56,197],"achieve":[11,120],"high":[12,43,121,158,161],"throughput,":[13],"while":[14],"batching":[15],"attention":[16],"operations":[17],"is":[18,139],"challenging,":[19],"rendering":[20],"memory":[21,39,52,85,220],"bandwidth":[22,44],"a":[23,171],"critical":[24],"bottleneck.Today,":[25],"mitigate":[27],"this":[28,117],"issue,":[29],"the":[30,48,75,82,88,90,130,133,143,175,199,204],"community":[31],"relies":[32],"on":[33,84,235],"high-end":[34],"GPUs":[35],"with":[36,129,215,227],"high-bandwidth":[38],"(HBM)":[40],"channels.Unfortunately,":[41],"HBM's":[42],"often":[45],"comes":[46,128,214],"at":[47],"expense":[49],"of":[50,145,179,237],"limited":[51],"capacity,":[53],"necessitating":[54],"systems":[55],"scale,":[57],"which":[58,98,193],"reduces":[59],"core":[60],"utilization":[61],"and":[62,123,160,167,219],"increases":[63],"costs.Moreover,":[64],"recent":[65],"advancements":[66],"enabling":[67],"longer":[68],"contexts":[69],"for":[70,105,114,135],"LLMs":[71],"have":[72],"substantially":[73],"increased":[74],"key-value":[76],"(KV)":[77],"cache":[78,95,181],"size,":[79],"further":[80],"intensifying":[81],"pressures":[83],"capacity.To":[86],"lower":[87],"pressure,":[89],"literature":[91],"has":[92],"explored":[93],"KV":[94,180],"quantization":[96,200],"techniques,":[97],"commonly":[99],"use":[100],"low":[101,124],"bitwidth":[102,111,125],"(e.g.,":[103,112],"INT4)":[104],"most":[106],"values,":[107],"selectively":[108],"using":[109],"higher":[110],"FP16)":[113],"outlier":[115,137,190],"values.While":[116],"approach":[118],"helps":[119],"accuracy":[122,159],"simultaneously,":[126],"it":[127],"limitation":[131],"that":[132,156,223],"cost":[134],"online":[136],"detection":[138],"excessively":[140],"high,":[141],"negating":[142],"advantages":[144],"quantization.Inspired":[146],"by":[147],"these":[148],"insights,":[149],"we":[150],"propose":[151],"Oaken,":[152],"an":[153,185,232],"acceleration":[154],"solution":[155],"achieves":[157],"performance":[162,210],"simultaneously":[163],"through":[164],"co-designing":[165],"algorithm":[166],"hardware.To":[168],"effectively":[169],"find":[170],"sweet":[172],"spot":[173],"in":[174],"accuracy-performance":[176],"trade-off":[177],"space":[178],"quantization,":[182],"Oaken":[183,212,233],"employs":[184],"online-offline":[186],"hybrid":[187],"approach,":[188],"setting":[189],"thresholds":[191],"offline,":[192],"are":[194],"then":[195],"used":[196],"determine":[198],"scale":[201],"online.To":[202],"translate":[203],"proposed":[205],"algorithmic":[206],"technique":[207],"into":[208],"tangible":[209],"gains,":[211],"also":[213],"custom":[216],"quantization/dequantization":[217],"engines":[218],"management":[221],"units":[222],"can":[224],"be":[225],"integrated":[226],"any":[228],"LLM":[229],"accelerators.We":[230],"built":[231],"accelerator":[234],"top":[236]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
