{"id":"https://openalex.org/W4415284284","doi":"https://doi.org/10.1109/lca.2025.3622724","title":"Accelerating LLM Inference via Dynamic KV Cache Placement in Heterogeneous Memory System","display_name":"Accelerating LLM Inference via Dynamic KV Cache Placement in Heterogeneous Memory System","publication_year":2025,"publication_date":"2025-07-01","ids":{"openalex":"https://openalex.org/W4415284284","doi":"https://doi.org/10.1109/lca.2025.3622724"},"language":null,"primary_location":{"id":"doi:10.1109/lca.2025.3622724","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3622724","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yunhua Fang","orcid":"https://orcid.org/0009-0009-4718-8825"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yunhua Fang","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0009-4718-8825","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045139717","display_name":"Rui Xie","orcid":"https://orcid.org/0000-0003-3177-5071"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rui Xie","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0000-0003-3177-5071","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113362542","display_name":"Asad Ul Haq","orcid":null},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Asad Ul Haq","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0003-7975-0102","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101209866","display_name":"Linsen Ma","orcid":"https://orcid.org/0009-0000-8535-7911"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Linsen Ma","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0000-8535-7911","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034513925","display_name":"Kaoutar El Maghraoui","orcid":"https://orcid.org/0000-0002-1967-8749"},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kaoutar El Maghraoui","raw_affiliation_strings":["IBM T.J. Watson Research Center, Yorktown Heights, NY, USA"],"raw_orcid":"https://orcid.org/0000-0002-1967-8749","affiliations":[{"raw_affiliation_string":"IBM T.J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082043392","display_name":"Naigang Wang","orcid":"https://orcid.org/0000-0001-7664-0061"},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Naigang Wang","raw_affiliation_strings":["IBM T.J. Watson Research Center, Yorktown Heights, NY, USA"],"raw_orcid":"https://orcid.org/0000-0001-7664-0061","affiliations":[{"raw_affiliation_string":"IBM T.J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377219","display_name":"Meng Wang","orcid":"https://orcid.org/0000-0003-0928-9691"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Meng Wang","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0000-0003-0928-9691","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100324294","display_name":"Liu Liu","orcid":"https://orcid.org/0000-0003-0792-8146"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Liu Liu","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0000-0003-0792-8146","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102023287","display_name":"Tong Zhang","orcid":"https://orcid.org/0009-0009-8005-0043"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tong Zhang","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0009-8005-0043","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I165799507"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33382406,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"24","issue":"2","first_page":"337","last_page":"340"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9861999750137329,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9861999750137329,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9822999835014343,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.694599986076355},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.5457000136375427},{"id":"https://openalex.org/keywords/non-uniform-memory-access","display_name":"Non-uniform memory access","score":0.45980000495910645},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.43790000677108765},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.43619999289512634},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.41839998960494995},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.37689998745918274},{"id":"https://openalex.org/keywords/cache-pollution","display_name":"Cache pollution","score":0.3483999967575073}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8830999732017517},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.694599986076355},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.5457000136375427},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4975000023841858},{"id":"https://openalex.org/C133371097","wikidata":"https://www.wikidata.org/wiki/Q868014","display_name":"Non-uniform memory access","level":5,"score":0.45980000495910645},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.43790000677108765},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.43619999289512634},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.41839998960494995},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.39750000834465027},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.37689998745918274},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.3483999967575073},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.34459999203681946},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3433000147342682},{"id":"https://openalex.org/C59687516","wikidata":"https://www.wikidata.org/wiki/Q5015938","display_name":"Cache-oblivious algorithm","level":5,"score":0.3303000032901764},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.32749998569488525},{"id":"https://openalex.org/C3720319","wikidata":"https://www.wikidata.org/wiki/Q5015937","display_name":"Cache-only memory architecture","level":5,"score":0.32109999656677246},{"id":"https://openalex.org/C63511323","wikidata":"https://www.wikidata.org/wiki/Q908936","display_name":"Interleaved memory","level":4,"score":0.29760000109672546},{"id":"https://openalex.org/C118702147","wikidata":"https://www.wikidata.org/wiki/Q189396","display_name":"Dynamic random-access memory","level":3,"score":0.28610000014305115},{"id":"https://openalex.org/C41036726","wikidata":"https://www.wikidata.org/wiki/Q844824","display_name":"Physical address","level":3,"score":0.2849999964237213},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2635999917984009},{"id":"https://openalex.org/C51290061","wikidata":"https://www.wikidata.org/wiki/Q1936765","display_name":"Uniform memory access","level":4,"score":0.25859999656677246},{"id":"https://openalex.org/C57863822","wikidata":"https://www.wikidata.org/wiki/Q905488","display_name":"Flat memory model","level":4,"score":0.2556999921798706},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.2554999887943268}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lca.2025.3622724","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3622724","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":4,"referenced_works":["https://openalex.org/W2024060531","https://openalex.org/W4385245566","https://openalex.org/W4393145114","https://openalex.org/W4402671766"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"inference":[4],"is":[5,128],"increasingly":[6],"constrained":[7],"by":[8],"memory":[9,27,68,76,140],"bandwidth,":[10],"with":[11,70],"frequent":[12],"access":[13],"to":[14,42,91],"the":[15,29,38,108,129],"key-value":[16],"(KV)":[17],"cache":[18,41,86,136],"dominating":[19],"data":[20],"movement.":[21],"While":[22],"attention":[23],"sparsity":[24],"reduces":[25],"some":[26],"traffic,":[28],"relevance":[30],"of":[31,133],"past":[32],"tokens":[33],"varies":[34],"over":[35],"time,":[36],"requiring":[37],"full":[39],"KV":[40,85,135],"remain":[43],"accessible":[44],"and":[45,51,60,112],"sustaining":[46],"pressure":[47],"on":[48],"both":[49],"bandwidth":[50,94],"capacity.":[52],"With":[53],"advances":[54],"in":[55,138],"interconnects":[56],"such":[57,89],"as":[58],"NVLink":[59],"LPDDR5X,":[61],"modern":[62],"AI":[63],"hardware":[64],"now":[65],"integrates":[66],"high-bandwidth":[67],"(HBM)":[69],"high-speed":[71],"off-package":[72],"DRAM,":[73],"making":[74],"heterogeneous":[75,139],"systems":[77,90,141],"a":[78,102,114],"practical":[79],"solution.":[80],"This":[81],"work":[82],"investigates":[83],"dynamic":[84,134],"placement":[87,109],"across":[88],"maximize":[92],"aggregated":[93],"utilization":[95],"under":[96],"capacity":[97],"constraints.":[98],"Rather":[99],"than":[100],"proposing":[101],"specific":[103],"scheduling":[104,137],"policy,":[105],"we":[106],"formulate":[107],"problem":[110],"mathematically":[111],"derive":[113],"theoretical":[115],"upper":[116],"bound,":[117],"revealing":[118],"substantial":[119],"headroom":[120],"for":[121,142],"runtime":[122],"optimization.":[123],"To":[124],"our":[125],"knowledge,":[126],"this":[127],"first":[130],"formal":[131],"treatment":[132],"LLM":[143],"inference.":[144]},"counts_by_year":[],"updated_date":"2025-11-06T23:17:08.748858","created_date":"2025-10-17T00:00:00"}
