{"id":"https://openalex.org/W7125772905","doi":"https://doi.org/10.1007/s40747-025-02200-4","title":"Multi-tier dynamic storage of KV cache for LLM inference under resource-constrained conditions","display_name":"Multi-tier dynamic storage of KV cache for LLM inference under resource-constrained conditions","publication_year":2026,"publication_date":"2026-01-27","ids":{"openalex":"https://openalex.org/W7125772905","doi":"https://doi.org/10.1007/s40747-025-02200-4"},"language":"en","primary_location":{"id":"doi:10.1007/s40747-025-02200-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s40747-025-02200-4","pdf_url":null,"source":{"id":"https://openalex.org/S3035462843","display_name":"Complex & Intelligent Systems","issn_l":"2198-6053","issn":["2198-6053","2199-4536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Complex &amp; Intelligent Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1007/s40747-025-02200-4","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123963519","display_name":"Junliang Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junliang Wang","raw_affiliation_strings":["China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China"],"raw_orcid":"https://orcid.org/0009-0002-0248-008X","affiliations":[{"raw_affiliation_string":"China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124032926","display_name":"Jiaqi Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaqi Hu","raw_affiliation_strings":["China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123890486","display_name":"Qingping Cao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingping Cao","raw_affiliation_strings":["China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110440283","display_name":"Yuanrui Zhu","orcid":"https://orcid.org/0000-0003-2688-1419"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanrui Zhu","raw_affiliation_strings":["China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077719982","display_name":"Xiancheng Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiancheng Lin","raw_affiliation_strings":["China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China Telecom Research Institute, China Telecom Co.,Ltd., Guangzhou, 510660, China","institution_ids":["https://openalex.org/I4210136246"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5123963519"],"corresponding_institution_ids":["https://openalex.org/I4210136246"],"apc_list":{"value":1320,"currency":"GBP","value_usd":1619},"apc_paid":{"value":1320,"currency":"GBP","value_usd":1619},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.13468691,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"12","issue":"3","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.1518000066280365,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.1518000066280365,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.12359999865293503,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.11460000276565552,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7193999886512756},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5432000160217285},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5400999784469604},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4474000036716461},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.3668999969959259},{"id":"https://openalex.org/keywords/computer-data-storage","display_name":"Computer data storage","score":0.3659000098705292},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.3411000072956085},{"id":"https://openalex.org/keywords/distributed-data-store","display_name":"Distributed data store","score":0.2971999943256378}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7867000102996826},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7193999886512756},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5432000160217285},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5400999784469604},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4474000036716461},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4212999939918518},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.3668999969959259},{"id":"https://openalex.org/C194739806","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Computer data storage","level":2,"score":0.3659000098705292},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.3411000072956085},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3370000123977661},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3077000081539154},{"id":"https://openalex.org/C24885549","wikidata":"https://www.wikidata.org/wiki/Q339678","display_name":"Distributed data store","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C121838778","wikidata":"https://www.wikidata.org/wiki/Q7619924","display_name":"Storage efficiency","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.2903999984264374},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2802000045776367},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.27480000257492065},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C2778456923","wikidata":"https://www.wikidata.org/wiki/Q5337692","display_name":"Edge computing","level":3,"score":0.2669999897480011},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.25929999351501465},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.25760000944137573},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2567000091075897}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/s40747-025-02200-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s40747-025-02200-4","pdf_url":null,"source":{"id":"https://openalex.org/S3035462843","display_name":"Complex & Intelligent Systems","issn_l":"2198-6053","issn":["2198-6053","2199-4536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Complex &amp; Intelligent Systems","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:e0138b334e414ec1aa6c9a8ddc97799e","is_oa":true,"landing_page_url":"https://doaj.org/article/e0138b334e414ec1aa6c9a8ddc97799e","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Complex & Intelligent Systems, Vol 12, Iss 3, Pp 1-17 (2026)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1007/s40747-025-02200-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s40747-025-02200-4","pdf_url":null,"source":{"id":"https://openalex.org/S3035462843","display_name":"Complex & Intelligent Systems","issn_l":"2198-6053","issn":["2198-6053","2199-4536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Complex &amp; Intelligent Systems","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W3205594649","https://openalex.org/W4205800966","https://openalex.org/W4229482837","https://openalex.org/W4387321091","https://openalex.org/W4387968003","https://openalex.org/W4388874804","https://openalex.org/W4392188863","https://openalex.org/W4395020691","https://openalex.org/W4395073435","https://openalex.org/W4401176373","https://openalex.org/W4401671778","https://openalex.org/W4402671799","https://openalex.org/W4403421327","https://openalex.org/W4404401018","https://openalex.org/W4405934565","https://openalex.org/W4407881031","https://openalex.org/W4408411073","https://openalex.org/W4408738433","https://openalex.org/W4408895130","https://openalex.org/W4409248688","https://openalex.org/W4409262447","https://openalex.org/W4409282707","https://openalex.org/W4410128284","https://openalex.org/W4410226886","https://openalex.org/W4411146975","https://openalex.org/W4412377078","https://openalex.org/W4412610757","https://openalex.org/W4413360541","https://openalex.org/W4415505959"],"related_works":[],"abstract_inverted_index":{"The":[0],"scale":[1],"of":[2,128],"large":[3,20],"language":[4],"models":[5,21],"(LLMs)":[6],"continues":[7],"to":[8,12,79,124,181],"grow":[9],"in":[10,33,50,106,151],"response":[11],"increasing":[13],"demands":[14],"for":[15],"intelligent":[16],"applications.":[17],"When":[18],"these":[19],"and":[22,46,54,88,118,131,147,171],"their":[23],"intermediate":[24],"results,":[25],"such":[26],"as":[27],"key-value":[28],"(KV)":[29],"caches,":[30],"are":[31],"deployed":[32],"resource-constrained":[34],"environments":[35],"like":[36],"edge":[37,152],"inference":[38,107,153,162],"scenarios,":[39],"they":[40],"impose":[41],"substantial":[42],"pressure":[43],"on":[44,91],"computational":[45],"storage":[47,55,82,114,137,175],"resources,":[48],"resulting":[49],"significant":[51],"performance":[52,145],"degradation":[53],"inefficiency.":[56],"To":[57],"address":[58,125],"the":[59,92,126],"problem,":[60],"this":[61],"paper":[62],"proposes":[63],"a":[64,80,96,112],"novel":[65],"Multi-Tier":[66],"Dynamic":[67],"Storage":[68],"(MTDS)":[69],"framework":[70],"that":[71,158],"offloads":[72],"KV":[73,98],"caches":[74],"from":[75],"limited":[76,139],"GPU":[77],"VRAM":[78],"hierarchical":[81,121],"system,":[83],"effectively":[84],"reducing":[85],"both":[86],"memory":[87],"computation":[89],"overhead":[90,133],"GPU.":[93],"By":[94],"introducing":[95],"selective":[97],"cache":[99,176],"reuse":[100],"mechanism,":[101],"MTDS":[102,159],"achieves":[103],"notable":[104],"improvements":[105],"performance.":[108],"We":[109],"further":[110],"develop":[111],"dynamic":[113],"access":[115],"control":[116],"scheme":[117],"an":[119],"adaptive":[120],"eviction":[122],"strategy":[123],"challenges":[127],"bandwidth":[129],"contention":[130],"capacity":[132],"introduced":[134],"by":[135,167,179],"multi-tier":[136,173],"under":[138],"resources.":[140],"These":[141],"techniques":[142],"significantly":[143],"alleviate":[144],"bottlenecks":[146],"reduce":[148],"resource":[149],"waste":[150],"servers.":[154],"Experimental":[155],"results":[156],"demonstrate":[157],"improves":[160],"LLM":[161],"efficiency,":[163],"reduces":[164],"first-token":[165],"latency":[166],"more":[168],"than":[169],"25%,":[170],"increases":[172],"active":[174],"hit":[177],"rate":[178],"up":[180],"20%.":[182]},"counts_by_year":[],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2026-01-28T00:00:00"}
