{"id":"https://openalex.org/W7160661707","doi":"https://doi.org/10.48550/arxiv.2605.06113","title":"Tackling the Data-Parallel Load Balancing Bottleneck in LLM Serving: Practical Online Routing at Scale","display_name":"Tackling the Data-Parallel Load Balancing Bottleneck in LLM Serving: Practical Online Routing at Scale","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160661707","doi":"https://doi.org/10.48550/arxiv.2605.06113"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.06113","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06113","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.06113","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123931370","display_name":"Tianci Bu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bu, Tianci","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124449680","display_name":"Yuan Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Yuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123990407","display_name":"Zixi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zixi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123923142","display_name":"Chendong Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Chendong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135688421","display_name":"Hong Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Hong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135707812","display_name":"Tsepten Gurung","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gurung, Tsepten","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124403732","display_name":"Yuwei Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Yuwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135674681","display_name":"Yinyu Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Yinyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135712452","display_name":"Zijie Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.31929999589920044,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.31929999589920044,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.3190000057220459,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.07360000163316727,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.8014000058174133},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6211000084877014},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.5647000074386597},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.527899980545044},{"id":"https://openalex.org/keywords/router","display_name":"Router","score":0.4862000048160553},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.47600001096725464},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4244999885559082},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.41830000281333923},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.38940000534057617}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8133999705314636},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.8014000058174133},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6211000084877014},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6100999712944031},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.5647000074386597},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.527899980545044},{"id":"https://openalex.org/C2775896111","wikidata":"https://www.wikidata.org/wiki/Q642560","display_name":"Router","level":2,"score":0.4862000048160553},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.47600001096725464},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4244999885559082},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.41830000281333923},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.39070001244544983},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.38940000534057617},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.34850001335144043},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.33239999413490295},{"id":"https://openalex.org/C2984173633","wikidata":"https://www.wikidata.org/wiki/Q22725","display_name":"Routing algorithm","level":4,"score":0.3172999918460846},{"id":"https://openalex.org/C43711488","wikidata":"https://www.wikidata.org/wiki/Q7534783","display_name":"Skew","level":2,"score":0.3167000114917755},{"id":"https://openalex.org/C22684755","wikidata":"https://www.wikidata.org/wiki/Q847526","display_name":"Queueing theory","level":2,"score":0.3109999895095825},{"id":"https://openalex.org/C109751979","wikidata":"https://www.wikidata.org/wiki/Q998767","display_name":"Failover","level":2,"score":0.30309998989105225},{"id":"https://openalex.org/C160403385","wikidata":"https://www.wikidata.org/wiki/Q220543","display_name":"Queue","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.27950000762939453},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.25920000672340393},{"id":"https://openalex.org/C196921405","wikidata":"https://www.wikidata.org/wiki/Q786431","display_name":"Online algorithm","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.06113","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06113","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.06113","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06113","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.6563581824302673}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Data-parallel":[0],"(DP)":[1],"load":[2],"balancing":[3],"has":[4,87],"emerged":[5],"as":[6],"a":[7,15,40,65,88,105,121,141,162,178,184,192,199,209],"first-order":[8],"bottleneck":[9],"in":[10,39],"large-scale":[11],"LLM":[12,79],"serving.":[13],"When":[14],"model":[16],"is":[17,45,73],"sharded":[18],"across":[19,31,57],"devices":[20],"via":[21],"tensor":[22],"parallelism":[23,27],"(TP)":[24],"or":[25],"expert":[26],"(EP)":[28],"and":[29,99,114,139,155,183,202,213,227],"replicated":[30],"many":[32],"DP":[33,58,225],"workers,":[34],"every":[35],"decode":[36,107],"step":[37,61],"ends":[38],"synchronization":[41],"barrier":[42],"whose":[43],"latency":[44],"set":[46],"by":[47],"the":[48,100,146,160,189,214],"most":[49],"heavily":[50],"loaded":[51],"worker;":[52],"even":[53],"modest":[54],"persistent":[55],"imbalance":[56,226],"workers":[59],"compounds,":[60],"after":[62],"step,":[63],"into":[64,159],"substantial":[66],"fraction":[67],"of":[68,111,116,123],"wasted":[69],"compute.":[70],"The":[71,132,172],"problem":[72],"hard":[74],"for":[75],"reasons":[76],"specific":[77],"to":[78,191],"decoding:":[80],"assignments":[81],"are":[82,97],"sticky":[83],"(migrating":[84],"KV":[85],"caches":[86],"high":[89],"cost),":[90],"per-request":[91],"loads":[92],"grow":[93],"over":[94,109],"time,":[95],"arrivals":[96],"non-stationary,":[98],"router":[101],"must":[102],"decide":[103],"within":[104],"sub-100\\,ms":[106],"budget":[108],"hundreds":[110],"waiting":[112],"requests":[113],"tens":[115],"workers.":[117],"We":[118,195],"present":[119],"\\textbf{BalanceRoute},":[120],"family":[122],"practical":[124],"online":[125],"routing":[126],"algorithms":[127],"that":[128,144,151,157],"target":[129],"this":[130],"bottleneck.":[131],"first,":[133],"\\textbf{BR-0},":[134],"requires":[135],"no":[136],"prediction":[137],"infrastructure":[138],"uses":[140],"piecewise-linear":[142],"F-score":[143,190],"captures":[145],"sharp":[147],"asymmetry":[148],"between":[149],"admissions":[150],"fill":[152],"safe":[153],"margin":[154],"those":[156],"overflow":[158],"envelope;":[161],"two-stage":[163],"decomposition":[164],"keeps":[165],"per-step":[166],"cost":[167],"compatible":[168],"with":[169,177],"millisecond-scale":[170],"scheduling.":[171],"second,":[173],"\\textbf{BR-H},":[174],"generalizes":[175],"BR-0":[176],"short,":[179],"constant":[180],"lookahead":[181],"$H$":[182],"lightweight":[185],"termination-classifier":[186],"interface,":[187],"extending":[188],"horizon-discounted":[193],"form.":[194],"deploy":[196],"BalanceRoute":[197,221],"on":[198,207],"144-NPU":[200],"cluster":[201],"evaluate":[203],"against":[204],"vLLM":[205],"baselines":[206],"both":[208,219],"proprietary":[210],"production":[211],"trace":[212],"public":[215],"Azure-2024":[216],"trace.":[217],"Across":[218],"workloads,":[220],"substantially":[222],"reduces":[223],"average":[224],"improves":[228],"end-to-end":[229],"serving":[230],"throughput.":[231]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-09T00:00:00"}
