{"id":"https://openalex.org/W7127569307","doi":"https://doi.org/10.1109/ccnc65079.2026.11366259","title":"Toward Cost-Efficient LLM Serving: A System-Level Memory Optimization Approach","display_name":"Toward Cost-Efficient LLM Serving: A System-Level Memory Optimization Approach","publication_year":2026,"publication_date":"2026-01-09","ids":{"openalex":"https://openalex.org/W7127569307","doi":"https://doi.org/10.1109/ccnc65079.2026.11366259"},"language":null,"primary_location":{"id":"doi:10.1109/ccnc65079.2026.11366259","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ccnc65079.2026.11366259","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE 23rd Consumer Communications &amp;amp; Networking Conference (CCNC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020438986","display_name":"Geunsik Lim","orcid":"https://orcid.org/0000-0003-1845-7132"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Geunsik Lim","raw_affiliation_strings":["Samsung Electronics,Seoul,South Korea"],"affiliations":[{"raw_affiliation_string":"Samsung Electronics,Seoul,South Korea","institution_ids":["https://openalex.org/I2250650973"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5020438986"],"corresponding_institution_ids":["https://openalex.org/I2250650973"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.19713891,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.2667999863624573,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.2667999863624573,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.15549999475479126,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.13760000467300415,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7027999758720398},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6080999970436096},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.48100000619888306},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.3880000114440918},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.34200000762939453},{"id":"https://openalex.org/keywords/resource-management","display_name":"Resource management (computing)","score":0.3361999988555908},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.33230000734329224},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.3292999863624573}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8363999724388123},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7027999758720398},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6080999970436096},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5062999725341797},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.48100000619888306},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4047999978065491},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3880000114440918},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.36399999260902405},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.34200000762939453},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.3361999988555908},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.33230000734329224},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.30630001425743103},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3027999997138977},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3003999888896942},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C2777958785","wikidata":"https://www.wikidata.org/wiki/Q17120940","display_name":"Resource efficiency","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C98025372","wikidata":"https://www.wikidata.org/wiki/Q477538","display_name":"Systems architecture","level":3,"score":0.2867000102996826},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C2994168587","wikidata":"https://www.wikidata.org/wiki/Q5295","display_name":"Random access memory","level":2,"score":0.2761000096797943},{"id":"https://openalex.org/C177950962","wikidata":"https://www.wikidata.org/wiki/Q10997658","display_name":"Non-volatile memory","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C37724790","wikidata":"https://www.wikidata.org/wiki/Q210813","display_name":"Direct memory access","level":3,"score":0.2689000070095062},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ccnc65079.2026.11366259","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ccnc65079.2026.11366259","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE 23rd Consumer Communications &amp;amp; Networking Conference (CCNC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.49738937616348267,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320332195","display_name":"Samsung","ror":"https://ror.org/04w3jy968"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W4321636575","https://openalex.org/W4390968835","https://openalex.org/W4400491578","https://openalex.org/W4406137434","https://openalex.org/W4412915583","https://openalex.org/W4414404571"],"related_works":[],"abstract_inverted_index":{"Serving":[0],"large-scale":[1],"language":[2],"models":[3],"(LLMs)":[4],"requires":[5],"significant":[6],"system":[7,13],"resources,":[8],"where":[9],"GPU":[10],"memory":[11,53],"limits,":[12],"bottlenecks,":[14],"and":[15,38,44,51,79,111,115,133,143,149],"I/O":[16],"delays":[17],"collectively":[18],"introduce":[19],"substantial":[20],"latency.":[21],"We":[22],"present":[23],"MBALL":[24,47,68,95,131],"(Memory":[25],"BALLooner),":[26],"a":[27,97],"lightweight":[28],"framework":[29],"that":[30,67,119],"jointly":[31],"optimizes":[32],"resource":[33],"allocation":[34],"across":[35,101],"VRAM,":[36],"DRAM,":[37],"NVMe":[39],"storage":[40],"to":[41,55,73],"enable":[42],"cost-efficient":[43],"sustainable":[45],"inference.":[46],"integrates":[48],"task-aware":[49],"scheduling":[50],"cooperative":[52],"management":[54],"mitigate":[56],"contention":[57],"in":[58,153],"real":[59,154],"deployments.":[60,155],"Experiments":[61],"with":[62,91],"state-of-the-art":[63],"LLM":[64,124],"workloads":[65],"show":[66],"improves":[69],"throughput":[70],"by":[71,77,83],"up":[72],"60%,":[74],"reduces":[75],"latency":[76],"35%,":[78],"lowers":[80],"VRAM":[81],"usage":[82],"15%;":[84],"results":[85],"represent":[86],"means":[87],"over":[88],"n=5":[89],"runs":[90],"95%":[92],"confidence":[93],"intervals.":[94],"provides":[96],"flexible":[98],"design":[99],"applicable":[100],"diverse":[102],"industrial":[103],"domains,":[104],"including":[105],"smart":[106],"logistics,":[107],"real-time":[108],"customer":[109],"services,":[110],"AI-driven":[112],"IoT":[113],"platforms,":[114],"offers":[116],"empirical":[117],"insights":[118],"can":[120],"inform":[121],"future":[122],"system-level":[123],"optimization":[125],"research.":[126],"This":[127],"paper":[128],"describes":[129],"the":[130],"architecture":[132],"its":[134,138],"core":[135],"components,":[136],"evaluates":[137],"performance":[139],"against":[140],"strong":[141],"baselines,":[142],"demonstrates":[144],"how":[145],"it":[146],"enables":[147],"scalable":[148],"practical":[150],"AI":[151],"services":[152]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-02-06T00:00:00"}
