{"id":"https://openalex.org/W4402701382","doi":"https://doi.org/10.48550/arxiv.2408.13510","title":"Intelligent Router for LLM Workloads: Improving Performance Through Workload-Aware Load Balancing","display_name":"Intelligent Router for LLM Workloads: Improving Performance Through Workload-Aware Load Balancing","publication_year":2024,"publication_date":"2024-08-24","ids":{"openalex":"https://openalex.org/W4402701382","doi":"https://doi.org/10.48550/arxiv.2408.13510"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.13510","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13510","pdf_url":"https://arxiv.org/pdf/2408.13510","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.13510","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059641905","display_name":"Kunal Jain","orcid":"https://orcid.org/0009-0001-6346-9055"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jain, Kunal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065463542","display_name":"Anjaly Parayil","orcid":"https://orcid.org/0000-0002-6296-0395"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Parayil, Anjaly","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021638015","display_name":"Ankur Mallick","orcid":"https://orcid.org/0009-0009-7068-5627"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mallick, Ankur","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045708568","display_name":"Esha Choukse","orcid":"https://orcid.org/0000-0003-0371-5522"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choukse, Esha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101848858","display_name":"Xiaoting Qin","orcid":"https://orcid.org/0000-0003-3631-9024"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Xiaoting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100752550","display_name":"Jue Zhang","orcid":"https://orcid.org/0000-0003-0440-1357"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090311560","display_name":"\u00cd\u00f1igo Goiri","orcid":"https://orcid.org/0000-0003-2591-4012"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goiri, \u00cd\u00f1igo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022003085","display_name":"Rujia Wang","orcid":"https://orcid.org/0000-0003-2424-6569"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Rujia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101967802","display_name":"Chetan Bansal","orcid":"https://orcid.org/0000-0003-0102-8139"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bansal, Chetan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049145726","display_name":"Victor R\u00fchle","orcid":"https://orcid.org/0000-0002-8957-7628"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"R\u00fchle, Victor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111335199","display_name":"Anoop Kulkarni","orcid":"https://orcid.org/0009-0006-4412-1252"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kulkarni, Anoop","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107526366","display_name":"Steve Kofsky","orcid":"https://orcid.org/0009-0001-8558-5954"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kofsky, Steve","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5108894474","display_name":"Saravan Rajmohan","orcid":"https://orcid.org/0009-0003-0204-7187"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajmohan, Saravan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5059641905"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.9745000004768372,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.9745000004768372,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9726999998092651,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12203","display_name":"Mobile Agent-Based Network Management","score":0.9657999873161316,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.7666410207748413},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7475802898406982},{"id":"https://openalex.org/keywords/router","display_name":"Router","score":0.7465141415596008},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5752089023590088},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.40669122338294983},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.36394333839416504},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.35406768321990967},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.35277122259140015},{"id":"https://openalex.org/keywords/operations-management","display_name":"Operations management","score":0.10489648580551147},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1015489399433136}],"concepts":[{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.7666410207748413},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7475802898406982},{"id":"https://openalex.org/C2775896111","wikidata":"https://www.wikidata.org/wiki/Q642560","display_name":"Router","level":2,"score":0.7465141415596008},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5752089023590088},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.40669122338294983},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.36394333839416504},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.35406768321990967},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.35277122259140015},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.10489648580551147},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1015489399433136}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.13510","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13510","pdf_url":"https://arxiv.org/pdf/2408.13510","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.13510","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.13510","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.13510","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13510","pdf_url":"https://arxiv.org/pdf/2408.13510","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4402701382.pdf","grobid_xml":"https://content.openalex.org/works/W4402701382.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2000785801","https://openalex.org/W986318368","https://openalex.org/W2384410913","https://openalex.org/W2352878646","https://openalex.org/W2122026593","https://openalex.org/W2004734601","https://openalex.org/W2130149817","https://openalex.org/W2990194547","https://openalex.org/W1480123525","https://openalex.org/W2620865396"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"workloads":[4,40,153],"have":[5],"distinct":[6,47],"prefill":[7],"and":[8,14,61,126,142,154,170,182,216],"decode":[9],"phases":[10,52],"with":[11,179],"different":[12,28,152,201],"compute":[13],"memory":[15],"requirements":[16],"which":[17],"should":[18],"ideally":[19],"be":[20],"accounted":[21],"for":[22,124,146,199,211],"when":[23],"scheduling":[24,36,60],"input":[25,181],"queries":[26,132],"across":[27,90,133],"LLM":[29,39,78,93,134,202],"instances":[30,94,135],"in":[31,53],"a":[32,101,118,138,143,165,197,212],"cluster.":[33],"However":[34],"existing":[35,162],"algorithms":[37],"treat":[38],"as":[41,196],"monolithic":[42],"jobs":[43],"without":[44],"considering":[45],"the":[46,50,74,91,97,109,148,190,208],"characteristics":[48],"of":[49,87,150,167],"two":[51],"each":[54],"workload.":[55],"This":[56],"leads":[57],"to":[58,100],"sub-optimal":[59],"increased":[62],"response":[63,75],"latency.":[64],"In":[65],"this":[66],"work,":[67],"we":[68,116],"start":[69],"by":[70,113,136],"characterizing":[71],"factors":[72],"affecting":[73],"latency":[76,99,160,174,210],"during":[77],"inference":[79,88,203],"serving.":[80],"We":[81],"establish":[82],"that":[83],"better":[84],"load":[85],"balancing":[86],"requests":[89],"available":[92],"can":[95,193],"improve":[96],"end-to-end":[98,159,173],"larger":[102],"extent":[103],"than":[104,161],"merely":[105],"focusing":[106],"on":[107,164,175],"optimizing":[108],"instance-level":[110,217],"scheduler.":[111],"Motivated":[112],"our":[114],"findings,":[115],"propose":[117],"heuristic-guided":[119],"reinforcement":[120],"learning-based":[121],"intelligent":[122],"router":[123,130],"data-driven":[125],"workload-aware":[127],"scheduling.":[128],"Our":[129],"schedules":[131],"leveraging":[137],"trainable":[139],"response-length":[140],"predictor,":[141],"novel":[144],"formulation":[145],"estimating":[147],"impact":[149],"mixing":[151],"achieves":[155],"over":[156],"11%":[157],"lower":[158,172],"approaches":[163],"mix":[166],"public":[168],"datasets":[169],"7.8%":[171],"real":[176],"workload":[177],"data":[178],"diverse":[180],"output":[183],"trends":[184],"from":[185],"Cloud":[186],"Provider":[187],"X.":[188],"Additionally,":[189],"proposed":[191],"framework":[192],"also":[194],"serve":[195],"standard":[198],"benchmarking":[200],"schedulers":[204],"since":[205],"it":[206],"provides":[207],"best":[209],"given":[213],"model,":[214],"hardware,":[215],"scheduler":[218],"combination.":[219]},"counts_by_year":[],"updated_date":"2026-03-09T08:58:05.943551","created_date":"2024-09-21T00:00:00"}
