{"id":"https://openalex.org/W4411403202","doi":"https://doi.org/10.1145/3725394","title":"Apt-Serve: Adaptive Request Scheduling on Hybrid Cache for Scalable LLM Inference Serving","display_name":"Apt-Serve: Adaptive Request Scheduling on Hybrid Cache for Scalable LLM Inference Serving","publication_year":2025,"publication_date":"2025-06-17","ids":{"openalex":"https://openalex.org/W4411403202","doi":"https://doi.org/10.1145/3725394"},"language":"en","primary_location":{"id":"doi:10.1145/3725394","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3725394","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2504.07494","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102938652","display_name":"Shihong Gao","orcid":"https://orcid.org/0000-0002-0413-9005"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Shihong Gao","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong SAR, China"],"raw_orcid":"https://orcid.org/0000-0002-0413-9005","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100327368","display_name":"Xin Zhang","orcid":"https://orcid.org/0000-0001-8560-5006"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xin Zhang","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong SAR, China"],"raw_orcid":"https://orcid.org/0000-0001-8560-5006","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053338416","display_name":"Yanyan Shen","orcid":"https://orcid.org/0000-0001-8364-3674"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanyan Shen","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-8364-3674","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100333516","display_name":"Lei Chen","orcid":"https://orcid.org/0000-0002-8257-5806"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lei Chen","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, Hong Kong SAR, China"],"raw_orcid":"https://orcid.org/0000-0002-8257-5806","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5102938652"],"corresponding_institution_ids":["https://openalex.org/I200769079"],"apc_list":null,"apc_paid":null,"fwci":1.2842,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.81238853,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":"3","issue":"3","first_page":"1","last_page":"28"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8986129760742188},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.725520670413971},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5866495966911316},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5819094777107239},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5673402547836304},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.553617000579834},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5359464883804321},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.48082244396209717},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.46696287393569946},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.44822460412979126},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.44700244069099426},{"id":"https://openalex.org/keywords/cache-pollution","display_name":"Cache pollution","score":0.44275978207588196},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.2604902982711792},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.20370444655418396},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.18275701999664307}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8986129760742188},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.725520670413971},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5866495966911316},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5819094777107239},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5673402547836304},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.553617000579834},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5359464883804321},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.48082244396209717},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.46696287393569946},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.44822460412979126},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.44700244069099426},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.44275978207588196},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2604902982711792},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.20370444655418396},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.18275701999664307},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3725394","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3725394","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2504.07494","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.07494","pdf_url":"https://arxiv.org/pdf/2504.07494","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-166261","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-166261","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2504.07494","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.07494","pdf_url":"https://arxiv.org/pdf/2504.07494","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1402809924","display_name":null,"funder_award_id":"2023YFF0725100","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G380298712","display_name":null,"funder_award_id":"U22B2060","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3952302843","display_name":null,"funder_award_id":"AoE/E-603/18","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8721642152","display_name":null,"funder_award_id":"2021SHZDZX0102","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8825964608","display_name":null,"funder_award_id":"2022YFE0200500","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320308943","display_name":"Microsoft Research","ror":"https://ror.org/00d0nc645"},{"id":"https://openalex.org/F4320316083","display_name":"Tencent","ror":"https://ror.org/00hhjss72"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322999","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04"},{"id":"https://openalex.org/F4320323537","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597"},{"id":"https://openalex.org/F4320333993","display_name":"Microsoft Research Asia","ror":"https://ror.org/0300m5276"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411403202.pdf","grobid_xml":"https://content.openalex.org/works/W4411403202.grobid-xml"},"referenced_works_count":63,"referenced_works":["https://openalex.org/W1938691758","https://openalex.org/W2048659529","https://openalex.org/W2055098778","https://openalex.org/W2525739395","https://openalex.org/W2612690371","https://openalex.org/W2734941459","https://openalex.org/W2786278116","https://openalex.org/W3084654275","https://openalex.org/W3106772683","https://openalex.org/W3134267570","https://openalex.org/W3138303811","https://openalex.org/W3165484655","https://openalex.org/W3173761726","https://openalex.org/W3173850788","https://openalex.org/W3177263144","https://openalex.org/W4205983429","https://openalex.org/W4220938834","https://openalex.org/W4224308101","https://openalex.org/W4226328099","https://openalex.org/W4242353006","https://openalex.org/W4281758439","https://openalex.org/W4282576620","https://openalex.org/W4285356434","https://openalex.org/W4285451014","https://openalex.org/W4288070868","https://openalex.org/W4294904053","https://openalex.org/W4308760184","https://openalex.org/W4310282800","https://openalex.org/W4313050539","https://openalex.org/W4318823398","https://openalex.org/W4323343904","https://openalex.org/W4366492495","https://openalex.org/W4367672983","https://openalex.org/W4380366709","https://openalex.org/W4380433123","https://openalex.org/W4380433202","https://openalex.org/W4380433249","https://openalex.org/W4381328689","https://openalex.org/W4381329687","https://openalex.org/W4383749415","https://openalex.org/W4385270423","https://openalex.org/W4385567877","https://openalex.org/W4387321091","https://openalex.org/W4389576338","https://openalex.org/W4392207935","https://openalex.org/W4393183662","https://openalex.org/W4393183933","https://openalex.org/W4394998727","https://openalex.org/W4395020691","https://openalex.org/W4396571465","https://openalex.org/W4396601317","https://openalex.org/W4399175255","https://openalex.org/W4400909578","https://openalex.org/W4400910550","https://openalex.org/W4401176373","https://openalex.org/W4401211704","https://openalex.org/W4401352033","https://openalex.org/W4401856724","https://openalex.org/W4402042769","https://openalex.org/W4407356947","https://openalex.org/W4407357364","https://openalex.org/W6810081322","https://openalex.org/W6846659131"],"related_works":["https://openalex.org/W2031173804","https://openalex.org/W3085471909","https://openalex.org/W2114386333","https://openalex.org/W2363769136","https://openalex.org/W198173854","https://openalex.org/W2324141783","https://openalex.org/W2148571123","https://openalex.org/W2509523906","https://openalex.org/W2012518269","https://openalex.org/W2167303720"],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"inference":[4,114,208],"serving":[5,209],"systems":[6,23,44],"are":[7],"essential":[8],"to":[9,19,24,36,47,53,108,190,198,205],"various":[10],"LLM-based":[11],"applications.":[12],"As":[13],"demand":[14],"for":[15,132],"LLM":[16,113],"services":[17],"continues":[18],"grow,":[20],"scaling":[21],"these":[22],"handle":[25],"high":[26],"request":[27,144],"rates":[28],"while":[29],"meeting":[30],"latency":[31],"Service-Level":[32],"Objectives":[33],"(SLOs),":[34],"referred":[35],"as":[37],"effective":[38,49,110,202],"throughput,":[39,50],"becomes":[40],"critical.":[41],"However,":[42],"existing":[43],"often":[45],"struggle":[46],"improve":[48],"primarily":[51],"due":[52],"a":[54,104,118,128],"significant":[55],"decline":[56],"in":[57,112,201],"Time":[58],"To":[59],"First":[60],"Token":[61],"(TTFT)":[62],"SLO":[63],"attainment.":[64],"We":[65,163],"identify":[66],"two":[67],"major":[68],"causes":[69],"of":[70],"this":[71,99],"bottleneck:":[72],"(1)":[73],"memory-intensive":[74],"KV":[75,125],"cache":[76,121,126,131],"that":[77,123,158,194],"limits":[78],"batch":[79,89,140,161],"size":[80],"expansion":[81],"under":[82],"GPU":[83],"memory":[84],"constraints,":[85],"and":[86,142,171,185],"(2)":[87],"rigid":[88],"composition":[90],"enforced":[91],"by":[92],"the":[93,148,166,206],"default":[94],"First-Come-First-Serve":[95],"scheduling":[96,156,168],"policy.":[97],"In":[98],"paper,":[100],"we":[101],"introduce":[102],"Apt-Serve,":[103],"scalable":[105],"framework":[106],"designed":[107],"enhance":[109],"throughput":[111,203],"serving.":[115],"Apt-Serve":[116,151,195],"features":[117],"new":[119],"hybrid":[120,149],"scheme":[122],"combines":[124],"with":[127,176],"memory-efficient":[129],"hidden":[130,135],"reusable":[133],"input":[134],"state":[136],"vectors,":[137],"allowing":[138],"large":[139],"sizes":[141],"improving":[143],"concurrency.":[145],"Based":[146],"on":[147,181],"cache,":[150],"employs":[152],"an":[153,173],"adaptive":[154,167],"runtime":[155],"mechanism":[157],"dynamically":[159],"optimizes":[160],"composition.":[162],"formally":[164],"define":[165],"optimization":[169],"problem":[170],"propose":[172],"efficient":[174],"algorithm":[175],"theoretical":[177],"guarantees.":[178],"Extensive":[179],"evaluations":[180],"three":[182],"real-world":[183],"datasets":[184],"LLMs":[186],"ranging":[187],"from":[188],"13B":[189],"66B":[191],"parameters":[192],"demonstrate":[193],"achieves":[196],"up":[197],"8.8x":[199],"improvement":[200],"compared":[204],"state-of-the-art":[207],"systems.":[210]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-30T09:15:22.047038","created_date":"2025-10-10T00:00:00"}
