{"id":"https://openalex.org/W7084128504","doi":"https://doi.org/10.1109/infocom55648.2025.11044533","title":"Mell: Memory-Efficient Large Language Model Serving via Multi-GPU KV Cache Management","display_name":"Mell: Memory-Efficient Large Language Model Serving via Multi-GPU KV Cache Management","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W7084128504","doi":"https://doi.org/10.1109/infocom55648.2025.11044533"},"language":"en","primary_location":{"id":"doi:10.1109/infocom55648.2025.11044533","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocom55648.2025.11044533","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Qianli Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Qianli Liu","raw_affiliation_strings":["The Hong Kong University of Science and Technology,Department of Computer Science and Engineering,Hong Kong,China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology,Department of Computer Science and Engineering,Hong Kong,China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zicong Hong","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Zicong Hong","raw_affiliation_strings":["The Hong Kong University of Science and Technology,Department of Computer Science and Engineering,Hong Kong,China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology,Department of Computer Science and Engineering,Hong Kong,China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Peng Li","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Li","raw_affiliation_strings":["School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Fahao Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I141591182","display_name":"University of Aizu","ror":"https://ror.org/02pg0e883","country_code":"JP","type":"education","lineage":["https://openalex.org/I141591182"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Fahao Chen","raw_affiliation_strings":["School of Computer Science and Engineering, University of Aizu,Japan"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Aizu,Japan","institution_ids":["https://openalex.org/I141591182"]}]},{"author_position":"last","author":{"id":null,"display_name":"Song Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Song Guo","raw_affiliation_strings":["The Hong Kong University of Science and Technology,Department of Computer Science and Engineering,Hong Kong,China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology,Department of Computer Science and Engineering,Hong Kong,China","institution_ids":["https://openalex.org/I200769079"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I200769079"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.54205607,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10647","display_name":"Coastal and Marine Dynamics","score":0.6577000021934509,"subfield":{"id":"https://openalex.org/subfields/1904","display_name":"Earth-Surface Processes"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10647","display_name":"Coastal and Marine Dynamics","score":0.6577000021934509,"subfield":{"id":"https://openalex.org/subfields/1904","display_name":"Earth-Surface Processes"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10930","display_name":"Flood Risk Assessment and Management","score":0.06689999997615814,"subfield":{"id":"https://openalex.org/subfields/2306","display_name":"Global and Planetary Change"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10779","display_name":"Coastal wetland ecosystem dynamics","score":0.03180000185966492,"subfield":{"id":"https://openalex.org/subfields/2303","display_name":"Ecology"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.8108000159263611},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.602400004863739},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5504999756813049},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.46050000190734863},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.44530001282691956},{"id":"https://openalex.org/keywords/smart-cache","display_name":"Smart Cache","score":0.4172999858856201},{"id":"https://openalex.org/keywords/cache-invalidation","display_name":"Cache invalidation","score":0.4146000146865845},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.39750000834465027}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8772000074386597},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.8108000159263611},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.602400004863739},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5504999756813049},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.46050000190734863},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.44530001282691956},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.43549999594688416},{"id":"https://openalex.org/C167713795","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"Smart Cache","level":5,"score":0.4172999858856201},{"id":"https://openalex.org/C25536678","wikidata":"https://www.wikidata.org/wiki/Q5015977","display_name":"Cache invalidation","level":5,"score":0.4146000146865845},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.41359999775886536},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.39750000834465027},{"id":"https://openalex.org/C133588205","wikidata":"https://www.wikidata.org/wiki/Q28455645","display_name":"Instruction prefetch","level":3,"score":0.3398999869823456},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3246000111103058},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.30889999866485596},{"id":"https://openalex.org/C85502023","wikidata":"https://www.wikidata.org/wiki/Q157171","display_name":"Renting","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C2778579508","wikidata":"https://www.wikidata.org/wiki/Q722192","display_name":"System call","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.30640000104904175},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2872999906539917},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.28189998865127563},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C2780799671","wikidata":"https://www.wikidata.org/wiki/Q17087362","display_name":"Transient (computer programming)","level":2,"score":0.27090001106262207}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/infocom55648.2025.11044533","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocom55648.2025.11044533","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-168713","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-168713","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Serving":[0],"large":[1],"language":[2],"models":[3],"(LLMs)":[4],"for":[5],"massive":[6],"users":[7],"is":[8],"challenged":[9],"by":[10,104,185,192],"the":[11,16,21,42,46,96,102,106,112,127,160,165,181,189],"significant":[12],"memory":[13],"footprint":[14],"of":[15,32,35,45,98,167,174,183],"transient":[17],"state,":[18],"known":[19],"as":[20],"key-value":[22],"(KV)":[23],"cache,":[24],"which":[25],"scales":[26],"with":[27,52,153],"sequence":[28],"length":[29],"and":[30,72,111,129,132,149,176,187],"number":[31,97,166,182],"requests.":[33],"Instead":[34],"renting":[36],"or":[37],"buying":[38],"more":[39,63],"expensive":[40],"GPUs,":[41,50],"load":[43,110],"imbalance":[44],"KV":[47,91,108],"cache":[48,92,109],"across":[49],"coupled":[51],"recent":[53],"advances":[54],"in":[55,101],"inter-GPU":[56],"communication,":[57],"provides":[58],"an":[59,120,141],"opportunity":[60],"to":[61,125,134,145,158,197],"serve":[62],"requests":[64],"via":[65,89],"request":[66,74,114,122,148],"migration.":[67,115],"However,":[68],"high":[69],"migration":[70,123,154],"overhead":[71],"unpredictable":[73],"patterns":[75],"make":[76],"it":[77,179],"challenging.":[78],"Therefore,":[79],"this":[80],"paper":[81],"proposes":[82],"Mell,":[83],"a":[84,146,172],"memory-efficient":[85],"LLM":[86,199],"serving":[87,200],"system":[88,103],"multi-GPU":[90,150],"management.":[93],"It":[94,156],"saves":[95],"GPUs":[99,162,184],"needed":[100],"considering":[105],"dynamic":[107],"costly":[113],"Specifically,":[116],"we":[117,139,170],"first":[118],"develop":[119],"adaptive":[121],"mechanism":[124],"balance":[126],"computational":[128],"communication":[130],"overheads":[131],"adapt":[133],"diverse":[135],"resource":[136],"conditions.":[137],"Then,":[138],"design":[140],"online":[142],"algorithm":[143],"tailored":[144],"multi-LLM":[147],"scheduling":[151],"problem":[152],"enabled.":[155],"aims":[157],"minimise":[159],"required":[161],"while":[163],"limiting":[164],"migrations.":[168],"Finally,":[169],"implement":[171],"prototype":[173],"Mell":[175],"demonstrate":[177],"that":[178],"reduces":[180],"31%":[186],"increases":[188],"GPU":[190],"utilization":[191],"43%":[193],"at":[194],"most":[195],"compared":[196],"existing":[198],"systems.":[201]},"counts_by_year":[],"updated_date":"2025-12-22T23:10:17.713674","created_date":"2025-10-10T00:00:00"}
