{"id":"https://openalex.org/W4409048416","doi":"https://doi.org/10.1145/3721146.3721947","title":"Performance Aware LLM Load Balancer for Mixed Workloads","display_name":"Performance Aware LLM Load Balancer for Mixed Workloads","publication_year":2025,"publication_date":"2025-03-30","ids":{"openalex":"https://openalex.org/W4409048416","doi":"https://doi.org/10.1145/3721146.3721947"},"language":"en","primary_location":{"id":"doi:10.1145/3721146.3721947","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3721146.3721947","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3721146.3721947","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 5th Workshop on Machine Learning and Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3721146.3721947","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108134529","display_name":"Kunal Jain","orcid":"https://orcid.org/0009-0009-2617-6251"},"institutions":[{"id":"https://openalex.org/I4210124949","display_name":"Microsoft Research (India)","ror":"https://ror.org/02w7f3w92","country_code":"IN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210124949"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Kunal Jain","raw_affiliation_strings":["Microsoft, Bangalore, India"],"raw_orcid":"https://orcid.org/0009-0009-2617-6251","affiliations":[{"raw_affiliation_string":"Microsoft, Bangalore, India","institution_ids":["https://openalex.org/I4210124949"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065463542","display_name":"Anjaly Parayil","orcid":"https://orcid.org/0000-0002-6296-0395"},"institutions":[{"id":"https://openalex.org/I4210124949","display_name":"Microsoft Research (India)","ror":"https://ror.org/02w7f3w92","country_code":"IN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210124949"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Anjaly Parayil","raw_affiliation_strings":["Microsoft Research, Bangalore, India"],"raw_orcid":"https://orcid.org/0000-0002-6296-0395","affiliations":[{"raw_affiliation_string":"Microsoft Research, Bangalore, India","institution_ids":["https://openalex.org/I4210124949"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021638015","display_name":"Ankur Mallick","orcid":"https://orcid.org/0009-0009-7068-5627"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ankur Mallick","raw_affiliation_strings":["Microsoft, Redmond, USA"],"raw_orcid":"https://orcid.org/0009-0009-7068-5627","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045708568","display_name":"Esha Choukse","orcid":"https://orcid.org/0000-0003-0371-5522"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Esha Choukse","raw_affiliation_strings":["Microsoft, Redmond, USA"],"raw_orcid":"https://orcid.org/0000-0003-0371-5522","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101848858","display_name":"Xiaoting Qin","orcid":"https://orcid.org/0000-0003-3631-9024"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoting Qin","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-3631-9024","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100752551","display_name":"Jue Zhang","orcid":"https://orcid.org/0000-0003-0472-9168"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jue Zhang","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-0472-9168","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090311560","display_name":"\u00cd\u00f1igo Goiri","orcid":"https://orcid.org/0000-0003-2591-4012"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"\u00cd\u00f1igo Goiri","raw_affiliation_strings":["Microsoft, Redmond, USA"],"raw_orcid":"https://orcid.org/0000-0003-2591-4012","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073855975","display_name":"Rujia Wang","orcid":"https://orcid.org/0000-0003-4019-5327"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rujia Wang","raw_affiliation_strings":["Microsoft, Chicago, USA"],"raw_orcid":"https://orcid.org/0000-0003-4019-5327","affiliations":[{"raw_affiliation_string":"Microsoft, Chicago, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101967802","display_name":"Chetan Bansal","orcid":"https://orcid.org/0000-0003-0102-8139"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chetan Bansal","raw_affiliation_strings":["Microsoft, Redmond, USA"],"raw_orcid":"https://orcid.org/0000-0003-0102-8139","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049145726","display_name":"Victor R\u00fchle","orcid":"https://orcid.org/0000-0002-8957-7628"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Victor R\u00fchle","raw_affiliation_strings":["Microsoft Research, Cambridge, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0002-8957-7628","affiliations":[{"raw_affiliation_string":"Microsoft Research, Cambridge, United Kingdom","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111335199","display_name":"Anoop Kulkarni","orcid":"https://orcid.org/0009-0006-4412-1252"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anoop Kulkarni","raw_affiliation_strings":["Microsoft, Redmond, USA"],"raw_orcid":"https://orcid.org/0009-0006-4412-1252","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107526366","display_name":"Steve Kofsky","orcid":"https://orcid.org/0009-0001-8558-5954"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Steve Kofsky","raw_affiliation_strings":["Microsoft, Redmond, USA"],"raw_orcid":"https://orcid.org/0009-0001-8558-5954","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108894474","display_name":"Saravan Rajmohan","orcid":"https://orcid.org/0009-0003-0204-7187"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Saravan Rajmohan","raw_affiliation_strings":["Microsoft 365, Redmond, USA"],"raw_orcid":"https://orcid.org/0009-0003-0204-7187","affiliations":[{"raw_affiliation_string":"Microsoft 365, Redmond, USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5108134529"],"corresponding_institution_ids":["https://openalex.org/I4210124949"],"apc_list":null,"apc_paid":null,"fwci":6.662,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.96396559,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"19","last_page":"30"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8117060661315918},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.42595335841178894},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.383484810590744},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.36421358585357666}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8117060661315918},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.42595335841178894},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.383484810590744},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.36421358585357666}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3721146.3721947","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3721146.3721947","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3721146.3721947","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 5th Workshop on Machine Learning and Systems","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3721146.3721947","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3721146.3721947","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3721146.3721947","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 5th Workshop on Machine Learning and Systems","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4409048416.pdf"},"referenced_works_count":12,"referenced_works":["https://openalex.org/W2115495728","https://openalex.org/W2524643686","https://openalex.org/W2760505947","https://openalex.org/W2950681488","https://openalex.org/W2963748441","https://openalex.org/W3155584966","https://openalex.org/W3172461472","https://openalex.org/W4281758439","https://openalex.org/W4313535045","https://openalex.org/W4387321091","https://openalex.org/W4394907705","https://openalex.org/W6797434948"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"workloads":[4,36],"consist":[5],"of":[6,129],"distinct":[7],"prefill":[8],"and":[9,16,54,121,157,173],"decode":[10],"phases,":[11],"each":[12],"with":[13],"unique":[14],"compute":[15],"memory":[17],"requirements":[18],"that":[19,74,190],"should":[20],"be":[21],"considered":[22],"when":[23],"routing":[24],"input":[25],"queries":[26,111],"across":[27,78,112],"cluster":[28],"instances.":[29],"However,":[30],"existing":[31,140],"load-balancing":[32],"algorithms":[33],"treat":[34],"these":[35,95],"as":[37,159],"monolithic":[38],"jobs,":[39],"ignoring":[40],"the":[41,44,64,90,127,180,191],"differences":[42],"between":[43],"two":[45],"phases.":[46],"This":[47],"oversight":[48],"leads":[49],"to":[50,169,183],"suboptimal":[51],"query":[52],"distribution":[53],"increased":[55],"response":[56,67],"latency.":[57],"In":[58],"our":[59],"work,":[60],"we":[61,97,177],"first":[62,150],"characterize":[63],"factors":[65],"affecting":[66],"latency":[68,85,138],"during":[69],"LLM":[70,80,113],"inference.":[71],"We":[72],"show":[73],"balancing":[75,166],"inference":[76],"requests":[77],"available":[79],"instances":[81,114],"can":[82,178],"improve":[83],"end-to-end":[84,137],"more":[86],"than":[87,139],"simply":[88],"optimizing":[89],"instance-level":[91],"scheduler.":[92],"Motivated":[93],"by":[94,115],"findings,":[96],"propose":[98],"a":[99,117,122,149,153,160],"heuristic-guided,":[100],"reinforcement":[101],"learning-based":[102],"router":[103,109],"for":[104,125,162,185],"data-driven,":[105],"workload-aware":[106],"scheduling.":[107],"Our":[108,146],"distributes":[110],"using":[116],"trainable":[118],"response-length":[119],"predictor":[120],"novel":[123],"formulation":[124],"estimating":[126],"impact":[128],"mixing":[130],"different":[131,170],"workloads,":[132],"achieving":[133],"over":[134],"11%":[135],"lower":[136],"methods":[141],"on":[142],"mixed":[143],"public":[144],"datasets.":[145],"framework":[147,156,182],"represents":[148],"step":[151],"toward":[152],"holistic":[154],"optimization":[155],"serves":[158],"benchmark":[161],"deriving":[163],"optimal":[164],"load":[165],"strategies":[167],"tailored":[168],"reward":[171],"functions":[172],"requirements.":[174],"Beyond":[175],"latency,":[176],"extend":[179],"proposed":[181],"optimize":[184],"various":[186],"performance":[187],"criteria":[188],"ensuring":[189],"system":[192],"meets":[193],"diverse":[194],"operational":[195],"objectives.":[196]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-30T09:15:22.047038","created_date":"2025-10-10T00:00:00"}
