{"id":"https://openalex.org/W7123868952","doi":"https://doi.org/10.1145/3772052.3772215","title":"Oneiros: KV Cache Optimization through Parameter Remapping for Multi-tenant LLM Serving","display_name":"Oneiros: KV Cache Optimization through Parameter Remapping for Multi-tenant LLM Serving","publication_year":2025,"publication_date":"2025-11-19","ids":{"openalex":"https://openalex.org/W7123868952","doi":"https://doi.org/10.1145/3772052.3772215"},"language":null,"primary_location":{"id":"doi:10.1145/3772052.3772215","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772215","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3772052.3772215","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101477549","display_name":"Ruihao Li","orcid":"https://orcid.org/0000-0002-7092-2401"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ruihao Li","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0000-0002-7092-2401","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041279256","display_name":"Shagnik Pal","orcid":"https://orcid.org/0009-0007-6509-4019"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shagnik Pal","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0009-0007-6509-4019","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092835393","display_name":"Vineeth Narayan Pullu","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vineeth Narayan Pullu","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0009-0003-0422-2516","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113099539","display_name":"Prasoon Sinha","orcid":"https://orcid.org/0000-0002-5538-8829"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Prasoon Sinha","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0000-0002-5538-8829","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111062876","display_name":"Jeeho Ryoo","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140396","display_name":"Becton Dickinson (Canada)","ror":"https://ror.org/031dsww89","country_code":"CA","type":"company","lineage":["https://openalex.org/I146461966","https://openalex.org/I4210140396"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Jeeho Ryoo","raw_affiliation_strings":["Fairleigh Dickinson University, Vancouver, Canada"],"raw_orcid":"https://orcid.org/0009-0003-0401-3685","affiliations":[{"raw_affiliation_string":"Fairleigh Dickinson University, Vancouver, Canada","institution_ids":["https://openalex.org/I4210140396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068885069","display_name":"Lizy K. John","orcid":"https://orcid.org/0000-0002-8747-5214"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lizy K. John","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0000-0002-8747-5214","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5031510918","display_name":"Neeraja J. Yadwadkar","orcid":"https://orcid.org/0009-0007-7556-3069"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Neeraja J. Yadwadkar","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0009-0007-7556-3069","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7127,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88886629,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"88","last_page":"101"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8583999872207642,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8583999872207642,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.037300001829862595,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.024299999698996544,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7904000282287598},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.48989999294281006},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4659999907016754},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4244000017642975},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.41830000281333923},{"id":"https://openalex.org/keywords/central-processing-unit","display_name":"Central processing unit","score":0.4108000099658966},{"id":"https://openalex.org/keywords/cache-pollution","display_name":"Cache pollution","score":0.39559999108314514},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.3736000061035156}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7957000136375427},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7904000282287598},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.664900004863739},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.48989999294281006},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4659999907016754},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4244000017642975},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.41830000281333923},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.4108000099658966},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.39559999108314514},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3919999897480011},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.3736000061035156},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3691999912261963},{"id":"https://openalex.org/C59687516","wikidata":"https://www.wikidata.org/wiki/Q5015938","display_name":"Cache-oblivious algorithm","level":5,"score":0.35249999165534973},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.34299999475479126},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3327000141143799},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.32589998841285706},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.314300000667572},{"id":"https://openalex.org/C167713795","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"Smart Cache","level":5,"score":0.30550000071525574},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.3018999993801117},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C133371097","wikidata":"https://www.wikidata.org/wiki/Q868014","display_name":"Non-uniform memory access","level":5,"score":0.2831000089645386},{"id":"https://openalex.org/C165005293","wikidata":"https://www.wikidata.org/wiki/Q1074500","display_name":"Chip","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C36340418","wikidata":"https://www.wikidata.org/wiki/Q7124288","display_name":"Page cache","level":5,"score":0.2574999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3772052.3772215","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772215","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3772052.3772215","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772215","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4333840493","display_name":null,"funder_award_id":"2326894, 2425655","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1941428696","https://openalex.org/W2761028280","https://openalex.org/W2764100055","https://openalex.org/W2808513690","https://openalex.org/W2883242673","https://openalex.org/W2908349016","https://openalex.org/W3012479151","https://openalex.org/W3012514909","https://openalex.org/W3015423804","https://openalex.org/W3081168214","https://openalex.org/W3097411828","https://openalex.org/W3129831491","https://openalex.org/W3205803342","https://openalex.org/W4313229743","https://openalex.org/W4387321091","https://openalex.org/W4394892775","https://openalex.org/W4394944658","https://openalex.org/W4394998727","https://openalex.org/W4401176373","https://openalex.org/W4401211704","https://openalex.org/W4403203873","https://openalex.org/W4404954819","https://openalex.org/W4405756372","https://openalex.org/W4407217670"],"related_works":[],"abstract_inverted_index":{"KV":[0,17,32,68,78,93],"cache":[1,33,42,79],"accelerates":[2],"LLM":[3],"inference":[4],"by":[5,81,126],"avoiding":[6],"redundant":[7],"computation,":[8],"at":[9,172],"the":[10,41,65,86,105,109,112,121,127,132],"expense":[11],"of":[12,111,148,168],"memory.":[13,38],"To":[14],"support":[15],"larger":[16],"caches,":[18],"prior":[19],"work":[20],"extends":[21],"GPU":[22,35],"memory":[23,26,50,87,106],"with":[24],"CPU":[25,37,49],"via":[27],"CPU-offloading.":[28],"This":[29,95],"involves":[30],"swapping":[31,46,80],"between":[34],"and":[36,83,159],"However,":[39],"because":[40],"updates":[43],"dynamically,":[44],"such":[45,130],"incurs":[47],"high":[48,122],"traffic.":[51],"We":[52],"make":[53],"a":[54,146],"key":[55],"observation":[56],"that":[57,139],"model":[58,90],"parameters":[59,91,110],"remain":[60],"constant":[61],"during":[62],"runtime,":[63],"unlike":[64],"dynamically":[66],"updated":[67],"cache.":[69,94],"Building":[70],"on":[71],"this,":[72],"we":[73,137],"introduce":[74],"Oneiros,":[75],"which":[76],"avoids":[77],"remapping,":[82],"thereby":[84],"repurposing,":[85],"allocated":[88],"to":[89,164],"for":[92,108],"parameter":[96],"remapping":[97],"is":[98,170],"especially":[99],"beneficial":[100],"in":[101,150,155],"multi-tenant":[102],"environments,":[103],"where":[104],"used":[107],"inactive":[113],"models":[114],"can":[115],"be":[116],"more":[117],"aggressively":[118],"reclaimed.":[119],"Exploiting":[120],"CPU-GPU":[123],"bandwidth":[124],"offered":[125],"modern":[128],"hardware,":[129],"as":[131],"NVIDIA":[133],"Grace":[134],"Hopper":[135],"Superchip,":[136],"show":[138],"Oneiros":[140,169],"significantly":[141],"outperforms":[142],"state-of-the-art":[143],"solutions,":[144],"achieving":[145],"reduction":[147],"44.8%-82.5%":[149],"tail":[151,156],"time-between-token":[152],"latency,":[153,158],"20.7%-99.3%":[154],"time-to-first-token":[157],"6.6%-86.7%":[160],"higher":[161],"throughput":[162],"compared":[163],"vLLM.":[165],"Source":[166],"code":[167],"available":[171],"https://github.com/UT-SysML/Oneiros/.":[173]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-12T08:23:45.883708","created_date":"2026-01-14T00:00:00"}
