{"id":"https://openalex.org/W7133549032","doi":"https://doi.org/10.1109/hpca68181.2026.11408548","title":"Towards Resource-Efficient Serverless LLM Inference with SLINFER","display_name":"Towards Resource-Efficient Serverless LLM Inference with SLINFER","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7133549032","doi":"https://doi.org/10.1109/hpca68181.2026.11408548"},"language":null,"primary_location":{"id":"doi:10.1109/hpca68181.2026.11408548","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408548","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010369591","display_name":"Chuhao Xu","orcid":"https://orcid.org/0009-0003-8267-4614"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chuhao Xu","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092687787","display_name":"Zijun LI","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zijun Li","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377840","display_name":"Quan Chen","orcid":"https://orcid.org/0000-0001-5832-0347"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Quan Chen","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128094920","display_name":"Han Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Han Zhao","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015527156","display_name":"Xueyan Tang","orcid":"https://orcid.org/0000-0002-7404-7595"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xueyan Tang","raw_affiliation_strings":["Nanyang Technological University"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5128045567","display_name":"Minyi Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minyi Guo","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5010369591"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.4043689,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"18"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.35929998755455017,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.35929998755455017,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.10899999737739563,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.07980000227689743,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.37279999256134033},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.27300000190734863},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.2639999985694885},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.25760000944137573},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.2434999942779541}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6150000095367432},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48669999837875366},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.25760000944137573},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2549000084400177},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2515000104904175},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.2434999942779541},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.23720000684261322}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca68181.2026.11408548","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408548","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.47450345754623413,"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth"}],"awards":[{"id":"https://openalex.org/G4179261564","display_name":null,"funder_award_id":"62232011,62302302","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2783772867","https://openalex.org/W3130689885","https://openalex.org/W3130716829","https://openalex.org/W3210617645","https://openalex.org/W4214610523","https://openalex.org/W4214690606","https://openalex.org/W4214764640","https://openalex.org/W4220980777","https://openalex.org/W4387321091","https://openalex.org/W4388040908","https://openalex.org/W4388989525","https://openalex.org/W4394998727","https://openalex.org/W4394998969","https://openalex.org/W4395020691","https://openalex.org/W4395106452","https://openalex.org/W4395112660","https://openalex.org/W4401211704","https://openalex.org/W4402671766","https://openalex.org/W4403337153","https://openalex.org/W4404401017","https://openalex.org/W4404401018","https://openalex.org/W4406266041","https://openalex.org/W4407217670","https://openalex.org/W4407218580","https://openalex.org/W4407218922","https://openalex.org/W4411486062","https://openalex.org/W4412875469"],"related_works":[],"abstract_inverted_index":{"The":[0],"rise":[1],"of":[2,46],"LLMs":[3,48,60,75],"has":[4],"driven":[5],"demand":[6],"for":[7,71],"private":[8],"serverless":[9,20,67],"deployments,":[10],"characterized":[11],"by":[12,150],"moderate-sized":[13],"models":[14],"and":[15,35,52,55,79,106,115,119,131,140],"infrequent":[16],"requests.":[17],"While":[18],"existing":[19],"solutions":[21],"follow":[22],"exclusive":[23],"GPU":[24],"allocation,":[25],"we":[26],"take":[27],"a":[28,65,104,121],"step":[29],"back":[30],"to":[31,73,98,111,162],"explore":[32],"modern":[33],"platforms":[34],"find":[36],"that:":[37],"Emerging":[38],"CPU":[39],"architectures":[40],"with":[41],"built-in":[42],"accelerators":[43],"are":[44],"capable":[45],"serving":[47,148],"but":[49],"remain":[50],"underutilized,":[51],"both":[53],"CPUs":[54,139,159],"GPUs":[56,143],"can":[57],"accommodate":[58],"multiple":[59],"simultaneously.":[61],"We":[62],"propose":[63],"SLINFER,":[64],"resource-efficient":[66],"inference":[68],"scheme":[69],"tailored":[70],"small-":[72],"mid-sized":[74],"that":[76,124,145],"enables":[77],"elastic":[78],"on-demand":[80],"sharing":[81],"across":[82],"heterogeneous":[83],"hardware.":[84],"SLINFER":[85,146],"tackles":[86],"three":[87],"fundamental":[88],"challenges:":[89],"(1)":[90],"precise,":[91],"fine-grained":[92],"compute":[93],"resource":[94],"allocation":[95],"at":[96],"token-level":[97],"handle":[99],"fluctuating":[100],"computational":[101],"demands;":[102],"(2)":[103],"coordinated":[105],"forward-looking":[107],"memory":[108],"scaling":[109],"mechanism":[110],"detect":[112],"out-ofmemory":[113],"hazards":[114],"reduce":[116],"operational":[117],"overhead;":[118],"(3)":[120],"dual":[122],"approach":[123],"consolidates":[125],"fragmented":[126],"instances":[127],"through":[128,154],"proactive":[129],"preemption":[130],"reactive":[132],"bin-packing.":[133],"Experimental":[134],"results":[135],"on":[136],"4":[137,141],"32-core":[138],"A100":[142],"show":[144],"improves":[147],"capacity":[149],"47%":[151],"-":[152,164],"62%":[153],"sharing,":[155],"while":[156],"further":[157],"leveraging":[158],"boosts":[160],"this":[161],"86%":[163],"154%.":[165]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-03-05T00:00:00"}
