{"id":"https://openalex.org/W7084047829","doi":"https://doi.org/10.1109/infocom55648.2025.11044599","title":"Online Context Caching for Distributed Large Language Models Serving","display_name":"Online Context Caching for Distributed Large Language Models Serving","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W7084047829","doi":"https://doi.org/10.1109/infocom55648.2025.11044599"},"language":"en","primary_location":{"id":"doi:10.1109/infocom55648.2025.11044599","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocom55648.2025.11044599","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bin Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Bin Gao","raw_affiliation_strings":["National University of Singapore,Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhuomin He","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhuomin He","raw_affiliation_strings":["Shanghai Jiao Tong University,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yizhen Yao","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yizhen Yao","raw_affiliation_strings":["Shanghai Jiao Tong University,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhanzhi Lou Lou","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Zhanzhi Lou Lou","raw_affiliation_strings":["National University of Singapore,Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhi Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhi Zhou","raw_affiliation_strings":["Sun Yat-sen University,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":null,"display_name":"Weng-Fai Wong","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Weng-Fai Wong","raw_affiliation_strings":["National University of Singapore,Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.59927163,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10336","display_name":"Cancer Cells and Metastasis","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/2730","display_name":"Oncology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T10336","display_name":"Cancer Cells and Metastasis","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/2730","display_name":"Oncology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12009","display_name":"Hedgehog Signaling Pathway Studies","score":0.0027000000700354576,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10929","display_name":"Wnt/\u03b2-catenin signaling in development and cancer","score":0.002199999988079071,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6355000138282776},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6197999715805054},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5946999788284302},{"id":"https://openalex.org/keywords/online-algorithm","display_name":"Online algorithm","score":0.48649999499320984},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.41290000081062317},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3937999904155731},{"id":"https://openalex.org/keywords/rounding","display_name":"Rounding","score":0.3772999942302704},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.35659998655319214}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.883899986743927},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6355000138282776},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6197999715805054},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5946999788284302},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5012000203132629},{"id":"https://openalex.org/C196921405","wikidata":"https://www.wikidata.org/wiki/Q786431","display_name":"Online algorithm","level":2,"score":0.48649999499320984},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3937999904155731},{"id":"https://openalex.org/C136625980","wikidata":"https://www.wikidata.org/wiki/Q663208","display_name":"Rounding","level":2,"score":0.3772999942302704},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C102408133","wikidata":"https://www.wikidata.org/wiki/Q5156350","display_name":"Competitive analysis","level":3,"score":0.319599986076355},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C5165142","wikidata":"https://www.wikidata.org/wiki/Q5432732","display_name":"False sharing","level":5,"score":0.29269999265670776},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.25929999351501465},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.25600001215934753},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.25600001215934753},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/infocom55648.2025.11044599","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocom55648.2025.11044599","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.40170907974243164,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"based":[4],"on":[5],"transformer":[6],"architectures":[7],"have":[8],"demonstrated":[9],"exceptional":[10],"performance":[11,172],"across":[12,81,130],"various":[13],"generative":[14],"tasks.":[15],"However,":[16],"the":[17,134,159],"significant":[18,171],"GPU":[19],"resources":[20],"required":[21],"for":[22,28],"LLM":[23,71,167],"inference":[24,57,128],"pose":[25],"financial":[26],"challenges":[27,110],"large-scale":[29],"deployment.":[30],"Context":[31],"caching":[32,68,79],"has":[33],"been":[34],"proposed":[35],"to":[36,55,91,101,126],"enhance":[37],"cost-efficiency":[38],"by":[39,111],"storing":[40],"intermediate":[41],"key":[42],"and":[43,123,148],"value":[44],"(KV)":[45],"pairs":[46],"in":[47,64,69,164],"cost-effective":[48],"storage":[49],"mediums,":[50],"which":[51],"can":[52],"be":[53],"reused":[54],"accelerate":[56],"when":[58],"requests":[59],"share":[60],"prefixes.":[61],"While":[62],"promising":[63],"single-instance":[65],"applications,":[66],"context":[67,78],"distributed":[70,166],"serving":[72,168],"systems":[73],"introduces":[74],"unique":[75],"challenges.":[76],"Firstly,":[77],"decisions":[80],"time":[82,131],"slots":[83],"are":[84],"inter-dependent,":[85],"affecting":[86],"overall":[87],"system":[88,169],"efficiency":[89],"due":[90],"potential":[92],"cache":[93,121],"misses.":[94],"Secondly,":[95],"request":[96,124],"scheduling":[97,125],"complexities":[98],"arise,":[99],"leading":[100],"load":[102],"balancing":[103],"issues":[104],"among":[105],"instances.":[106],"We":[107],"address":[108],"these":[109],"formulating":[112],"an":[113],"online":[114],"optimization":[115],"problem":[116],"that":[117],"jointly":[118],"decides":[119],"KV":[120],"placement":[122],"minimize":[127],"costs":[129],"slots.":[132],"Given":[133],"NP-hard":[135],"nature":[136],"of":[137],"this":[138],"problem,":[139],"we":[140],"propose":[141],"a":[142,155,165],"framework":[143],"leveraging":[144],"regularization,":[145],"linear":[146],"relaxation,":[147],"randomized":[149],"rounding":[150],"techniques.":[151],"Our":[152],"solution":[153],"achieves":[154],"competitive":[156],"ratio":[157],"near":[158],"offline":[160],"optimum.":[161],"Experimental":[162],"results":[163],"demonstrate":[170],"improvements":[173],"over":[174],"baseline":[175],"methods.":[176]},"counts_by_year":[],"updated_date":"2025-12-28T23:10:05.387466","created_date":"2025-10-10T00:00:00"}
