{"id":"https://openalex.org/W4387321091","doi":"https://doi.org/10.1145/3600006.3613165","title":"Efficient Memory Management for Large Language Model Serving with PagedAttention","display_name":"Efficient Memory Management for Large Language Model Serving with PagedAttention","publication_year":2023,"publication_date":"2023-10-03","ids":{"openalex":"https://openalex.org/W4387321091","doi":"https://doi.org/10.1145/3600006.3613165"},"language":"en","primary_location":{"id":"doi:10.1145/3600006.3613165","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3600006.3613165","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3600006.3613165","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3600006.3613165","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066299337","display_name":"Woosuk Kwon","orcid":"https://orcid.org/0009-0008-8870-4892"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Woosuk Kwon","raw_affiliation_strings":["UC Berkeley, Berkeley, United States of America"],"affiliations":[{"raw_affiliation_string":"UC Berkeley, Berkeley, United States of America","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067888631","display_name":"Z. Li","orcid":"https://orcid.org/0009-0004-1534-9106"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhuohan Li","raw_affiliation_strings":["UC Berkeley, Berkeley, United States of America"],"affiliations":[{"raw_affiliation_string":"UC Berkeley, Berkeley, United States of America","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083595182","display_name":"Siyuan Zhuang","orcid":"https://orcid.org/0009-0007-3787-0316"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Siyuan Zhuang","raw_affiliation_strings":["UC Berkeley, Berkeley, USA"],"affiliations":[{"raw_affiliation_string":"UC Berkeley, Berkeley, USA","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083402013","display_name":"Ying Sheng","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]},{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ying Sheng","raw_affiliation_strings":["UC Berkeley and Stanford University, Berkeley, USA"],"affiliations":[{"raw_affiliation_string":"UC Berkeley and Stanford University, Berkeley, USA","institution_ids":["https://openalex.org/I95457486","https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013500109","display_name":"L Zheng","orcid":"https://orcid.org/0000-0002-6611-4612"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lianmin Zheng","raw_affiliation_strings":["UC Berkeley, Berkeley, United States of America"],"affiliations":[{"raw_affiliation_string":"UC Berkeley, Berkeley, United States of America","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008633310","display_name":"Cody Hao Yu","orcid":"https://orcid.org/0000-0002-9298-6254"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cody Hao Yu","raw_affiliation_strings":["Independent Researcher, Berkeley, United States of America"],"affiliations":[{"raw_affiliation_string":"Independent Researcher, Berkeley, United States of America","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072427753","display_name":"Joseph E. Gonzalez","orcid":"https://orcid.org/0000-0003-2921-956X"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joseph Gonzalez","raw_affiliation_strings":["UC Berkeley, Berkeley, United States of America"],"affiliations":[{"raw_affiliation_string":"UC Berkeley, Berkeley, United States of America","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092145402","display_name":"Hao Zhang","orcid":"https://orcid.org/0009-0003-8392-3977"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hao Zhang","raw_affiliation_strings":["UC San Diego, La Jolla, United States of America"],"affiliations":[{"raw_affiliation_string":"UC San Diego, La Jolla, United States of America","institution_ids":["https://openalex.org/I36258959"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5041920173","display_name":"Ion Stoica","orcid":"https://orcid.org/0000-0002-5373-0088"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ion Stoica","raw_affiliation_strings":["UC Berkeley, Berkeley, United States of America"],"affiliations":[{"raw_affiliation_string":"UC Berkeley, Berkeley, United States of America","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5066299337"],"corresponding_institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":154.8093,"has_fulltext":true,"cited_by_count":899,"citation_normalized_percentile":{"value":0.99986185,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"611","last_page":"626"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.988099992275238,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9872000217437744,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8598029613494873},{"id":"https://openalex.org/keywords/paging","display_name":"Paging","score":0.7440685033798218},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6487390995025635},{"id":"https://openalex.org/keywords/demand-paging","display_name":"Demand paging","score":0.6292343139648438},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5352877378463745},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.4912266135215759},{"id":"https://openalex.org/keywords/virtual-memory","display_name":"Virtual memory","score":0.4682157039642334},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.45709553360939026},{"id":"https://openalex.org/keywords/cache-pollution","display_name":"Cache pollution","score":0.44173580408096313},{"id":"https://openalex.org/keywords/page-cache","display_name":"Page cache","score":0.4315970540046692},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.42580246925354004},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.4250311255455017},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.406555712223053},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3706657886505127},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.36251378059387207},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.17822396755218506}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8598029613494873},{"id":"https://openalex.org/C50954386","wikidata":"https://www.wikidata.org/wiki/Q656083","display_name":"Paging","level":2,"score":0.7440685033798218},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6487390995025635},{"id":"https://openalex.org/C188873839","wikidata":"https://www.wikidata.org/wiki/Q5255045","display_name":"Demand paging","level":5,"score":0.6292343139648438},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5352877378463745},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.4912266135215759},{"id":"https://openalex.org/C76399640","wikidata":"https://www.wikidata.org/wiki/Q189401","display_name":"Virtual memory","level":4,"score":0.4682157039642334},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.45709553360939026},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.44173580408096313},{"id":"https://openalex.org/C36340418","wikidata":"https://www.wikidata.org/wiki/Q7124288","display_name":"Page cache","level":5,"score":0.4315970540046692},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.42580246925354004},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.4250311255455017},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.406555712223053},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3706657886505127},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.36251378059387207},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.17822396755218506},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3600006.3613165","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3600006.3613165","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3600006.3613165","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3600006.3613165","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3600006.3613165","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3600006.3613165","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320316785","display_name":"VMware","ror":null},{"id":"https://openalex.org/F4320325295","display_name":"Zayed University","ror":"https://ror.org/03snqfa66"},{"id":"https://openalex.org/F4320332195","display_name":"Samsung","ror":"https://ror.org/04w3jy968"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387321091.pdf","grobid_xml":"https://content.openalex.org/works/W4387321091.grobid-xml"},"referenced_works_count":19,"referenced_works":["https://openalex.org/W2164155474","https://openalex.org/W2194775991","https://openalex.org/W2512924740","https://openalex.org/W2734941459","https://openalex.org/W2798291715","https://openalex.org/W2914209329","https://openalex.org/W2979044977","https://openalex.org/W2979826702","https://openalex.org/W2982157693","https://openalex.org/W3012479151","https://openalex.org/W3037639655","https://openalex.org/W3095488153","https://openalex.org/W3130716829","https://openalex.org/W3172198372","https://openalex.org/W4226479682","https://openalex.org/W4281758439","https://openalex.org/W4301361180","https://openalex.org/W4307315283","https://openalex.org/W6862640317"],"related_works":["https://openalex.org/W2357520409","https://openalex.org/W4241723377","https://openalex.org/W2155379745","https://openalex.org/W4235954812","https://openalex.org/W4242544852","https://openalex.org/W1820333363","https://openalex.org/W2010739099","https://openalex.org/W2116040063","https://openalex.org/W2093630148","https://openalex.org/W1993535731"],"abstract_inverted_index":{"High":[0],"throughput":[1,120],"serving":[2,86],"of":[3,79,101,121,130],"large":[4],"language":[5],"models":[6],"(LLMs)":[7],"requires":[8],"batching":[9],"sufficiently":[10],"many":[11],"requests":[12,107],"at":[13,163],"a":[14],"time.":[15],"However,":[16],"existing":[17],"systems":[18],"struggle":[19],"because":[20],"the":[21,52,67,119,127,134],"key-value":[22],"cache":[23,95,103],"(KV":[24],"cache)":[25],"memory":[26,41,70,96,111],"for":[27],"each":[28],"request":[29],"is":[30,144,160],"huge":[31],"and":[32,34,48,71,97,105,140,152],"grows":[33],"shrinks":[35],"dynamically.":[36],"When":[37],"managed":[38],"inefficiently,":[39],"this":[40,57],"can":[42],"be":[43],"significantly":[44],"wasted":[45],"by":[46,66,124],"fragmentation":[47],"redundant":[49],"duplication,":[50],"limiting":[51],"batch":[53],"size.":[54],"To":[55],"address":[56],"problem,":[58],"we":[59,81],"propose":[60],"PagedAttention,":[61],"an":[62,84],"attention":[63],"algorithm":[64],"inspired":[65],"classical":[68],"virtual":[69],"paging":[72],"techniques":[73],"in":[74,93],"operating":[75],"systems.":[76],"On":[77],"top":[78],"it,":[80],"build":[82],"vLLM,":[83],"LLM":[85],"system":[87],"that":[88,116],"achieves":[89],"(1)":[90],"near-zero":[91],"waste":[92],"KV":[94,102],"(2)":[98],"flexible":[99],"sharing":[100],"within":[104],"across":[106],"to":[108,133],"further":[109],"reduce":[110],"usage.":[112],"Our":[113],"evaluations":[114],"show":[115],"vLLM":[117],"improves":[118],"popular":[122],"LLMs":[123],"2--4\u00d7":[125],"with":[126,147],"same":[128],"level":[129],"latency":[131],"compared":[132],"state-of-the-art":[135],"systems,":[136],"such":[137],"as":[138],"FasterTransformer":[139],"Orca.":[141],"The":[142],"improvement":[143],"more":[145,153],"pronounced":[146],"longer":[148],"sequences,":[149],"larger":[150],"models,":[151],"complex":[154],"decoding":[155],"algorithms.":[156],"vLLM's":[157],"source":[158],"code":[159],"publicly":[161],"available":[162],"https://github.com/vllm-project/vllm.":[164]},"counts_by_year":[{"year":2026,"cited_by_count":145},{"year":2025,"cited_by_count":582},{"year":2024,"cited_by_count":163},{"year":2023,"cited_by_count":9}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
