{"id":"https://openalex.org/W4414898386","doi":"https://doi.org/10.1109/icdcs63083.2025.00062","title":"MCaM : Efficient LLM Inference with Multi-tier KV Cache Management","display_name":"MCaM : Efficient LLM Inference with Multi-tier KV Cache Management","publication_year":2025,"publication_date":"2025-07-21","ids":{"openalex":"https://openalex.org/W4414898386","doi":"https://doi.org/10.1109/icdcs63083.2025.00062"},"language":"en","primary_location":{"id":"doi:10.1109/icdcs63083.2025.00062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icdcs63083.2025.00062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 45th International Conference on Distributed Computing Systems (ICDCS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Kexin Chu","orcid":null},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Kexin Chu","raw_affiliation_strings":["University of Connecticut,School of Computing"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,School of Computing","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035994989","display_name":"Zixu Shen","orcid":null},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zixu Shen","raw_affiliation_strings":["University of Connecticut,School of Computing"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,School of Computing","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Sheng-Ru Cheng","orcid":null},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sheng-Ru Cheng","raw_affiliation_strings":["University of Connecticut,School of Computing"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,School of Computing","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035075306","display_name":"Dawei Xiang","orcid":"https://orcid.org/0000-0001-6151-8522"},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dawei Xiang","raw_affiliation_strings":["University of Connecticut,School of Computing"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,School of Computing","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102913730","display_name":"Ziqin Liu","orcid":"https://orcid.org/0000-0003-3310-4861"},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ziqin Liu","raw_affiliation_strings":["University of Connecticut,School of Computing"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,School of Computing","institution_ids":["https://openalex.org/I140172145"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100577526","display_name":"Wei Zhang","orcid":"https://orcid.org/0009-0004-9512-4192"},"institutions":[{"id":"https://openalex.org/I140172145","display_name":"University of Connecticut","ror":"https://ror.org/02der9h97","country_code":"US","type":"education","lineage":["https://openalex.org/I140172145"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wei Zhang","raw_affiliation_strings":["University of Connecticut,School of Computing"],"affiliations":[{"raw_affiliation_string":"University of Connecticut,School of Computing","institution_ids":["https://openalex.org/I140172145"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I140172145"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31300096,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"571","last_page":"581"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9898999929428101,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.8398000001907349},{"id":"https://openalex.org/keywords/smart-cache","display_name":"Smart Cache","score":0.5619000196456909},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.5400000214576721},{"id":"https://openalex.org/keywords/cache-invalidation","display_name":"Cache invalidation","score":0.42989999055862427},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.41269999742507935},{"id":"https://openalex.org/keywords/cache-pollution","display_name":"Cache pollution","score":0.382999986410141},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.3668000102043152},{"id":"https://openalex.org/keywords/page-cache","display_name":"Page cache","score":0.3443000018596649}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8485999703407288},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.8398000001907349},{"id":"https://openalex.org/C167713795","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"Smart Cache","level":5,"score":0.5619000196456909},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.5400000214576721},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4580000042915344},{"id":"https://openalex.org/C25536678","wikidata":"https://www.wikidata.org/wiki/Q5015977","display_name":"Cache invalidation","level":5,"score":0.42989999055862427},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.41269999742507935},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.382999986410141},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.3668000102043152},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.36550000309944153},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.35109999775886536},{"id":"https://openalex.org/C36340418","wikidata":"https://www.wikidata.org/wiki/Q7124288","display_name":"Page cache","level":5,"score":0.3443000018596649},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.3418999910354614},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.33390000462532043},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3280999958515167},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.32420000433921814},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C51185590","wikidata":"https://www.wikidata.org/wiki/Q1017228","display_name":"Bus sniffing","level":5,"score":0.31029999256134033},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.28929999470710754},{"id":"https://openalex.org/C120936851","wikidata":"https://www.wikidata.org/wiki/Q1408065","display_name":"MESI protocol","level":5,"score":0.28850001096725464},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.2874999940395355},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icdcs63083.2025.00062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icdcs63083.2025.00062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 45th International Conference on Distributed Computing Systems (ICDCS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W3085139254","https://openalex.org/W4281850905","https://openalex.org/W4321636575","https://openalex.org/W4360831803","https://openalex.org/W4366341968","https://openalex.org/W4387321091","https://openalex.org/W4393145094","https://openalex.org/W4394892775","https://openalex.org/W4394945037","https://openalex.org/W4401176373","https://openalex.org/W4402670433","https://openalex.org/W4404401017","https://openalex.org/W4404401018","https://openalex.org/W4404781668","https://openalex.org/W4407217670","https://openalex.org/W4408848702","https://openalex.org/W4409363823","https://openalex.org/W4409963643"],"related_works":[],"abstract_inverted_index":{"The":[0],"KV":[1,37,71,86,103,120,160,178],"cache":[2,38,72,81,87,104,121,161,179],"in":[3,28,162,203],"current":[4],"LLM":[5,233],"serving":[6,52,64],"system":[7,82],"is":[8,19,25,78],"primarily":[9],"used":[10],"to":[11,125,156,201,216,237,243],"accelerate":[12],"processing":[13],"within":[14],"a":[15,79,141,152,186],"single":[16],"request":[17,68,127,240],"and":[18,33,50,70,89,123,129,147,218],"aggressively":[20],"deleted":[21],"once":[22],"the":[23,36,67,85,102,119,136,159,176,204,229],"response":[24],"generated.":[26],"However,":[27],"scenarios":[29],"like":[30],"virtual":[31],"assistants":[32],"multi-turn":[34],"conversations,":[35],"can":[39,45,211,226],"be":[40],"reused":[41],"across":[42,91,132],"requests,":[43],"which":[44],"dramatically":[46],"reduce":[47,212,228],"computation":[48],"costs":[49],"improve":[51,219],"latency.":[53],"Caching":[54],"historical":[55,106],"tokens,":[56],"however,":[57],"significantly":[58],"increases":[59],"memory":[60,99],"requirements.":[61],"Furthermore,":[62],"existing":[63],"systems":[65],"treat":[66],"scheduler":[69,124],"separately,":[73],"despite":[74],"their":[75],"tight":[76],"coupling.MCaM":[77],"multi-tier":[80],"that":[83,144,196,209],"enables":[84],"reuse":[88],"sharing":[90],"requests.":[92],"It":[93,225],"leverages":[94],"DRAM":[95,202],"as":[96],"slow-":[97],"tier":[98],"for":[100],"storing":[101],"of":[105,232],"prompts.":[107],"To":[108,134,181],"efficiently":[109],"utilize":[110],"fast-tier":[111],"Hign":[112],"Bandwidth":[113],"Memory(HBM)":[114],"on":[115],"GPU,":[116],"we":[117,190],"co-designed":[118],"manager":[122],"coordinate":[126],"scheduling":[128],"token":[130],"placement":[131],"tiers.":[133],"hide":[135],"reload":[137],"time,":[138],"MCaM":[139,150,210],"employs":[140],"pipeline":[142],"prefetcher":[143],"overlaps":[145],"communication":[146],"computation.":[148],"Additionally,":[149],"incorporates":[151],"quality-aware":[153],"sparsification":[154],"algorithm":[155],"heterogeneously":[157],"compress":[158],"each":[163],"layer.":[164],"This":[165],"approach":[166],"not":[167],"only":[168],"reduces":[169],"data":[170,183,198],"transfer":[171],"size":[172],"but":[173],"also":[174,227],"decreases":[175],"overall":[177],"size.":[180],"remove":[182],"offloading":[184],"from":[185,199],"request\u2019s":[187],"critical":[188],"path,":[189],"designed":[191],"an":[192],"asynchronous":[193],"offload":[194],"engine":[195],"swaps":[197],"HBM":[200],"background.":[205],"Our":[206],"experiments":[207],"show":[208],"TTFT":[213],"by":[214,223,235],"up":[215,236],"69%":[217],"prompt":[220],"prefilling":[221],"throughput":[222],"3.3X.":[224],"end-to-end":[230],"latency":[231],"inference":[234],"58%":[238],"when":[239],"length":[241],"increase":[242],"4096":[244],"tokens.":[245]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
