{"id":"https://openalex.org/W7092292774","doi":"https://doi.org/10.48550/arxiv.2510.14392","title":"FairBatching: Fairness-Aware Batch Formation for LLM Inference","display_name":"FairBatching: Fairness-Aware Batch Formation for LLM Inference","publication_year":2025,"publication_date":"2025-10-16","ids":{"openalex":"https://openalex.org/W7092292774","doi":"https://doi.org/10.48550/arxiv.2510.14392"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2510.14392","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.14392","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2510.14392","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Lyu, Hongtao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lyu, Hongtao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Boyue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Boyue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wu, Mingyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Mingyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Chen, Haibo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Haibo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T13187","display_name":"Diffusion and Search Dynamics","score":0.030400000512599945,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T13187","display_name":"Diffusion and Search Dynamics","score":0.030400000512599945,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.026499999687075615,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T13052","display_name":"Molecular Communication and Nanonetworks","score":0.017000000923871994,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.635200023651123},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6348999738693237},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.6107000112533569},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5371000170707703},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.48330000042915344},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.46059998869895935},{"id":"https://openalex.org/keywords/quality-of-service","display_name":"Quality of service","score":0.41999998688697815},{"id":"https://openalex.org/keywords/queueing-theory","display_name":"Queueing theory","score":0.413100004196167},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.3903000056743622},{"id":"https://openalex.org/keywords/queue","display_name":"Queue","score":0.3732999861240387}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8266000151634216},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.635200023651123},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6348999738693237},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.6107000112533569},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5371000170707703},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5055999755859375},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.48330000042915344},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.46059998869895935},{"id":"https://openalex.org/C5119721","wikidata":"https://www.wikidata.org/wiki/Q220501","display_name":"Quality of service","level":2,"score":0.41999998688697815},{"id":"https://openalex.org/C22684755","wikidata":"https://www.wikidata.org/wiki/Q847526","display_name":"Queueing theory","level":2,"score":0.413100004196167},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.4043999910354614},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3903000056743622},{"id":"https://openalex.org/C160403385","wikidata":"https://www.wikidata.org/wiki/Q220543","display_name":"Queue","level":2,"score":0.3732999861240387},{"id":"https://openalex.org/C172658912","wikidata":"https://www.wikidata.org/wiki/Q661613","display_name":"Batch processing","level":2,"score":0.35899999737739563},{"id":"https://openalex.org/C162262903","wikidata":"https://www.wikidata.org/wiki/Q343527","display_name":"Allocator","level":2,"score":0.35030001401901245},{"id":"https://openalex.org/C176553487","wikidata":"https://www.wikidata.org/wiki/Q7855819","display_name":"Turnaround time","level":2,"score":0.33640000224113464},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.3314000070095062},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.33000001311302185},{"id":"https://openalex.org/C2781395549","wikidata":"https://www.wikidata.org/wiki/Q4680762","display_name":"Adaptive sampling","level":3,"score":0.320499986410141},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.31529998779296875},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.31310001015663147},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.3116999864578247},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.2849000096321106},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.25760000944137573},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.2551000118255615},{"id":"https://openalex.org/C140781008","wikidata":"https://www.wikidata.org/wiki/Q1221081","display_name":"Service quality","level":3,"score":0.2531999945640564},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2510.14392","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.14392","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2510.14392","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.14392","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"inference":[4,115],"systems":[5],"face":[6],"a":[7,20,92,112,183],"fundamental":[8],"tension":[9],"between":[10,122],"minimizing":[11],"Time-to-First-Token":[12],"(TTFT)":[13],"latency":[14,206],"for":[15,30],"new":[16],"requests":[17],"and":[18,61,95,124,152,196,223],"maintaining":[19,213],"high,":[21],"steady":[22],"token":[23],"generation":[24],"rate":[25],"(low":[26],"Time-Per-Output-Token,":[27],"or":[28],"TPOT)":[29],"ongoing":[31],"requests.":[32],"Existing":[33],"stall-free":[34],"batching":[35],"schedulers":[36],"proposed":[37],"by":[38,207],"Sarathi,":[39],"while":[40,211],"effective":[41,190],"at":[42],"preventing":[43],"decode":[44,52,59,125,171],"stalls,":[45],"introduce":[46],"significant":[47],"computational":[48,139],"unfairness.":[49],"They":[50],"prioritize":[51],"tasks":[53,172],"excessively,":[54],"simultaneously":[55],"leading":[56],"to":[57,102,104,141,166,173,209],"underutilized":[58],"slack":[60],"unnecessary":[62],"prefill":[63,123,175],"queuing":[64],"delays,":[65],"which":[66,135],"collectively":[67],"degrade":[68],"the":[69,79,85,96,138,143,160],"system's":[70],"overall":[71,217],"quality":[72],"of":[73,82,88],"service":[74],"(QoS).":[75],"This":[76],"work":[77],"identifies":[78],"root":[80],"cause":[81],"this":[83],"unfairness:":[84],"non-monotonic":[86],"nature":[87],"Time-Between-Tokens":[89],"(TBT)":[90],"as":[91],"scheduling":[93],"metric":[94],"rigid":[97],"decode-prioritizing":[98,161],"policy":[99],"that":[100,117],"fails":[101],"adapt":[103],"dynamic":[105,153],"workload":[106],"bursts.":[107],"We":[108],"therefore":[109],"propose":[110],"FairBatching,":[111],"novel":[113,184],"LLM":[114],"scheduler":[116],"enforces":[118],"fair":[119,151],"resource":[120],"allocation":[121],"tasks.":[126],"It":[127],"features":[128],"an":[129],"adaptive":[130],"batch":[131,154],"capacity":[132,222],"determination":[133],"mechanism,":[134],"dynamically":[136],"adjusts":[137],"budget":[140],"improve":[142],"GPU":[144],"utilization":[145],"without":[146],"triggering":[147],"SLO":[148],"violations.":[149],"Its":[150],"formation":[155],"algorithm":[156],"breaks":[157],"away":[158],"from":[159,169],"paradigm,":[162],"allowing":[163],"computation":[164],"resources":[165],"be":[167],"reclaimed":[168],"bursting":[170],"serve":[174],"surges,":[176],"achieving":[177,216],"global":[178],"fairness.":[179],"Furthermore,":[180],"FairBatching":[181,201],"provides":[182],"load":[185],"estimation":[186],"method,":[187],"enabling":[188],"more":[189],"coordination":[191],"with":[192],"upper-level":[193],"schedulers.":[194],"Implemented":[195],"evaluated":[197],"on":[198],"realistic":[199],"traces,":[200],"significantly":[202],"reduces":[203],"TTFT":[204],"tail":[205],"up":[208],"2.29x":[210],"robustly":[212],"TPOT":[214],"SLOs,":[215],"20.0%":[218],"improvement":[219,225],"in":[220,226],"single-node":[221],"54.3%":[224],"cluster-level":[227],"capacity.":[228]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-18T00:00:00"}
