{"id":"https://openalex.org/W4416927406","doi":"https://doi.org/10.1145/3771576","title":"SAGESERVE: Optimizing LLM Serving on Cloud Data Centers with Forecast Aware Auto-Scaling","display_name":"SAGESERVE: Optimizing LLM Serving on Cloud Data Centers with Forecast Aware Auto-Scaling","publication_year":2025,"publication_date":"2025-12-01","ids":{"openalex":"https://openalex.org/W4416927406","doi":"https://doi.org/10.1145/3771576"},"language":"en","primary_location":{"id":"doi:10.1145/3771576","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3771576","pdf_url":null,"source":{"id":"https://openalex.org/S4210193547","display_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","issn_l":"2476-1249","issn":["2476-1249"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1145/3771576","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054650531","display_name":"Shashwat Jaiswal","orcid":"https://orcid.org/0000-0002-2526-5780"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shashwat Jaiswal","raw_affiliation_strings":["University of Illinois, Urbana-Champaign, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois, Urbana-Champaign, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108134529","display_name":"Kunal Jain","orcid":"https://orcid.org/0009-0009-2617-6251"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kunal Jain","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041794289","display_name":"Yogesh Simmhan","orcid":"https://orcid.org/0000-0003-4140-7774"},"institutions":[{"id":"https://openalex.org/I59270414","display_name":"Indian Institute of Science Bangalore","ror":"https://ror.org/04dese585","country_code":"IN","type":"education","lineage":["https://openalex.org/I59270414"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Yogesh Simmhan","raw_affiliation_strings":["Indian Institute of Science, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Science, Bangalore, India","institution_ids":["https://openalex.org/I59270414"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065463542","display_name":"Anjaly Parayil","orcid":"https://orcid.org/0000-0002-6296-0395"},"institutions":[{"id":"https://openalex.org/I4210124949","display_name":"Microsoft Research (India)","ror":"https://ror.org/02w7f3w92","country_code":"IN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210124949"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Anjaly Parayil","raw_affiliation_strings":["Microsoft Research, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Bangalore, India","institution_ids":["https://openalex.org/I4210124949"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021638015","display_name":"Ankur Mallick","orcid":"https://orcid.org/0009-0009-7068-5627"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ankur Mallick","raw_affiliation_strings":["Microsoft, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073855975","display_name":"Rujia Wang","orcid":"https://orcid.org/0000-0003-4019-5327"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rujia Wang","raw_affiliation_strings":["Microsoft, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068951662","display_name":"Ren\u00e9e St. Amant","orcid":"https://orcid.org/0009-0006-9387-5886"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Renee St. Amant","raw_affiliation_strings":["Microsoft, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101967802","display_name":"Chetan Bansal","orcid":"https://orcid.org/0000-0003-0102-8139"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chetan Bansal","raw_affiliation_strings":["Microsoft, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049145726","display_name":"Victor R\u00fchle","orcid":"https://orcid.org/0000-0002-8957-7628"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Victor Ruhle","raw_affiliation_strings":["Microsoft Research, Cambridge, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Cambridge, United Kingdom","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111335199","display_name":"Anoop Kulkarni","orcid":"https://orcid.org/0009-0006-4412-1252"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anoop Kulkarni","raw_affiliation_strings":["Microsoft, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107526366","display_name":"Steve Kofsky","orcid":"https://orcid.org/0009-0001-8558-5954"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Steve Kofsky","raw_affiliation_strings":["Microsoft, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108894474","display_name":"Saravan Rajmohan","orcid":"https://orcid.org/0009-0003-0204-7187"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Saravan Rajmohan","raw_affiliation_strings":["Microsoft 365, Redmond, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft 365, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5054650531"],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":3.5342,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.94956973,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":"9","issue":"3","first_page":"1","last_page":"24"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.41780000925064087,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.41780000925064087,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.2630999982357025,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.08540000021457672,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.756600022315979},{"id":"https://openalex.org/keywords/data-center","display_name":"Data center","score":0.5414000153541565},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5170000195503235},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5105000138282776},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.5090000033378601},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.49160000681877136},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.4429999887943268},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.40059998631477356}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.824400007724762},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.756600022315979},{"id":"https://openalex.org/C153740404","wikidata":"https://www.wikidata.org/wiki/Q671224","display_name":"Data center","level":2,"score":0.5414000153541565},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5170000195503235},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5105000138282776},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.5090000033378601},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.49160000681877136},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.4429999887943268},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.413100004196167},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.40059998631477356},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3837999999523163},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.3707999885082245},{"id":"https://openalex.org/C116537","wikidata":"https://www.wikidata.org/wiki/Q2169973","display_name":"Service provider","level":3,"score":0.3522999882698059},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.33660000562667847},{"id":"https://openalex.org/C172191483","wikidata":"https://www.wikidata.org/wiki/Q1071806","display_name":"Provisioning","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.3082999885082245},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.2957000136375427},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2915000021457672},{"id":"https://openalex.org/C56086750","wikidata":"https://www.wikidata.org/wiki/Q6042592","display_name":"Integer programming","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C181889124","wikidata":"https://www.wikidata.org/wiki/Q380204","display_name":"Service level","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C41045048","wikidata":"https://www.wikidata.org/wiki/Q202843","display_name":"Linear programming","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2646999955177307},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3771576","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3771576","pdf_url":null,"source":{"id":"https://openalex.org/S4210193547","display_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","issn_l":"2476-1249","issn":["2476-1249"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3771576","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3771576","pdf_url":null,"source":{"id":"https://openalex.org/S4210193547","display_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","issn_l":"2476-1249","issn":["2476-1249"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Measurement and Analysis of Computing Systems","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W2115743854","https://openalex.org/W2148777482","https://openalex.org/W2907606578","https://openalex.org/W2963926728","https://openalex.org/W3023238978","https://openalex.org/W4387321091","https://openalex.org/W4394998727","https://openalex.org/W4401211704","https://openalex.org/W4404401017","https://openalex.org/W4405396166","https://openalex.org/W4406325768","https://openalex.org/W4408894415","https://openalex.org/W4409048416","https://openalex.org/W4409325865"],"related_works":[],"abstract_inverted_index":{"Global":[0],"cloud":[1,112],"service":[2],"providers":[3],"handle":[4],"inference":[5,46],"workloads":[6,37,97,125],"for":[7],"Large":[8],"Language":[9],"Models":[10],"(LLMs)":[11],"that":[12,160],"span":[13],"latency-sensitive":[14],"(e.g.,":[15,19],"chatbots)":[16],"and":[17,26,55,65,120,131,184,191,195,204,217,228,246,272,281],"insensitive":[18],"report":[20],"writing)":[21],"tasks,":[22],"resulting":[23,256],"in":[24,126,238,257],"diverse":[25],"often":[27,61],"conflicting":[28],"Service":[29],"Level":[30],"Agreement":[31],"(SLA)":[32],"requirements.":[33],"Managing":[34],"such":[35,63,140],"mixed":[36],"is":[38,135],"challenging":[39],"due":[40,85,250],"to":[41,79,86,151,163,175,235,241,251,265],"the":[42,45,94,104,138,193,242,282],"complexity":[43],"of":[44,82,103,107,137,143,181,263],"serving":[47,96,158],"stack,":[48],"which":[49],"encompasses":[50],"multiple":[51],"models,":[52],"GPU":[53,70,182],"hardware,":[54],"global":[56],"data":[57,128,176],"centers.":[58],"Existing":[59],"solutions":[60],"silo":[62],"fast":[64],"slow":[66],"tasks":[67],"onto":[68],"separate":[69],"resource":[71,196],"pools":[72],"with":[73,113,178,187],"different":[74,127],"SLAs,":[75],"but":[76],"this":[77,90],"leads":[78],"significant":[80],"under-utilization":[81],"expensive":[83],"accelerators":[84],"load":[87],"mismatch.":[88],"In":[89],"article,":[91],"we":[92],"characterize":[93],"LLM":[95,145,157],"at":[98,287],"Microsoft":[99,110],"Office":[100],"365,":[101],"one":[102,136],"largest":[105],"users":[106],"LLMs":[108],"within":[109],"Azure":[111],"over":[114],"10":[115,221],"million":[116,222],"requests":[117,224],"per":[118],"day,":[119],"highlight":[121],"key":[122],"observations":[123],"across":[124,132,225],"center":[129],"regions":[130,227],"time.":[133],"This":[134],"first":[139],"public":[141],"studies":[142],"Internet-scale":[144],"workloads.":[146],"We":[147,211,232],"use":[148],"these":[149],"insights":[150],"propose":[152],"SageServe":[153,213,283],",":[154],"a":[155,200,258],"comprehensive":[156],"framework":[159],"dynamically":[161],"adapts":[162],"workload":[164,276],"demands":[165],"using":[166,199],"multi-timescale":[167],"control":[168],"knobs.":[169],"It":[170],"combines":[171],"short-term":[172],"request":[173],"routing":[174,194],"centers":[177],"long-term":[179],"scaling":[180],"VMs":[183],"model":[185,203],"placement":[186],"higher":[188],"lead":[189],"times,":[190],"co-optimizes":[192],"allocation":[197],"problem":[198],"traffic":[201],"forecast":[202],"an":[205],"Integer":[206],"Linear":[207],"Programming":[208],"(ILP)":[209],"solution.":[210],"evaluate":[212],"through":[214],"real":[215],"runs":[216],"realistic":[218],"simulations":[219],"on":[220],"production":[223],"three":[226],"four":[229],"open-source":[230],"models.":[231],"achieve":[233],"up":[234,264],"25%":[236],"savings":[237,262],"GPU-hours":[239],"compared":[240],"current":[243],"baseline":[244],"deployment":[245],"reduce":[247],"GPU-hour":[248],"wastage":[249],"inefficient":[252],"auto-scaling":[253],"by":[254],"80%,":[255],"potential":[259],"monthly":[260],"cost":[261],"$2.5":[266],"million,":[267],"while":[268],"maintaining":[269],"tail":[270],"latency":[271],"meeting":[273],"SLAs.":[274],"The":[275],"traces,":[277],"our":[278],"simulator":[279],"harness":[280],"scheduler":[284],"are":[285],"available":[286],"https://github.com/shashwatj07/SageServe.":[288]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-12-02T00:00:00"}
