{"id":"https://openalex.org/W4415981663","doi":"https://doi.org/10.1145/3767295.3769328","title":"TokenFlow: Responsive LLM Text Streaming Serving under Request Burst via Preemptive Scheduling","display_name":"TokenFlow: Responsive LLM Text Streaming Serving under Request Burst via Preemptive Scheduling","publication_year":2026,"publication_date":"2026-04-24","ids":{"openalex":"https://openalex.org/W4415981663","doi":"https://doi.org/10.1145/3767295.3769328"},"language":"en","primary_location":{"id":"doi:10.1145/3767295.3769328","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3767295.3769328","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st European Conference on Computer Systems","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3767295.3769328","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Junyi Chen","orcid":"https://orcid.org/0009-0003-7397-6311"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junyi Chen","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0003-7397-6311","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103235629","display_name":"Chen Du","orcid":"https://orcid.org/0000-0002-6572-9260"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chuheng Du","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-3465-4023","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088035128","display_name":"R. Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Renyuan Liu","raw_affiliation_strings":["George Mason University, Fairfax, VA, USA"],"raw_orcid":"https://orcid.org/0000-0001-9710-6116","affiliations":[{"raw_affiliation_string":"George Mason University, Fairfax, VA, USA","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005026261","display_name":"Shuochao Yao","orcid":"https://orcid.org/0000-0002-4070-6345"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shuochao Yao","raw_affiliation_strings":["George Mason University, Fairfax, VA, USA"],"raw_orcid":"https://orcid.org/0000-0001-7446-1430","affiliations":[{"raw_affiliation_string":"George Mason University, Fairfax, VA, USA","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046959244","display_name":"Dandan Yan","orcid":"https://orcid.org/0009-0008-2220-0085"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dingtian Yan","raw_affiliation_strings":["China Telecom Corporation Limited Shanghai Branch, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-3112-8215","affiliations":[{"raw_affiliation_string":"China Telecom Corporation Limited Shanghai Branch, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103392323","display_name":"Jiang Liao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiang Liao","raw_affiliation_strings":["China Telecom Corporation Limited Shanghai Branch, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0007-2128-3096","affiliations":[{"raw_affiliation_string":"China Telecom Corporation Limited Shanghai Branch, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091362073","display_name":"Shengzhong Liu","orcid":"https://orcid.org/0000-0002-6338-852X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengzhong Liu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-7643-7239","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082402770","display_name":"Fan Wu","orcid":"https://orcid.org/0000-0003-3615-1217"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fan Wu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-0965-9058","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100428808","display_name":"Guihai Chen","orcid":"https://orcid.org/0000-0002-6934-1685"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guihai Chen","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-6934-1685","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00533467,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"497","last_page":"513"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.6482999920845032,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.6482999920845032,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.14190000295639038,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.07859999686479568,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7648000121116638},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7046999931335449},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6970000267028809},{"id":"https://openalex.org/keywords/preemption","display_name":"Preemption","score":0.6122999787330627},{"id":"https://openalex.org/keywords/resource-consumption","display_name":"Resource consumption","score":0.4142000079154968},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4092999994754791},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.3889999985694885},{"id":"https://openalex.org/keywords/token-passing","display_name":"Token passing","score":0.36970001459121704}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8804000020027161},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7648000121116638},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7046999931335449},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6970000267028809},{"id":"https://openalex.org/C206952183","wikidata":"https://www.wikidata.org/wiki/Q1193100","display_name":"Preemption","level":2,"score":0.6122999787330627},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.6093999743461609},{"id":"https://openalex.org/C2777480716","wikidata":"https://www.wikidata.org/wiki/Q23582796","display_name":"Resource consumption","level":2,"score":0.4142000079154968},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4092999994754791},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3889999985694885},{"id":"https://openalex.org/C115067241","wikidata":"https://www.wikidata.org/wiki/Q1639854","display_name":"Token passing","level":3,"score":0.36970001459121704},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3465000092983246},{"id":"https://openalex.org/C122141398","wikidata":"https://www.wikidata.org/wiki/Q5456330","display_name":"Fixed-priority pre-emptive scheduling","level":5,"score":0.3140999972820282},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C19012869","wikidata":"https://www.wikidata.org/wiki/Q578372","display_name":"Response time","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.29179999232292175},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.25099998712539673},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2502000033855438},{"id":"https://openalex.org/C2984822820","wikidata":"https://www.wikidata.org/wiki/Q1123036","display_name":"Processor scheduling","level":3,"score":0.2500999867916107}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3767295.3769328","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3767295.3769328","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st European Conference on Computer Systems","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2510.02758","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.02758","pdf_url":"https://arxiv.org/pdf/2510.02758","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2510.02758","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.02758","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.1145/3767295.3769328","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3767295.3769328","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st European Conference on Computer Systems","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Real-time":[0],"LLM":[1,32,67],"interactions":[2],"demand":[3],"streamed":[4],"token":[5,92,96,161],"generations,":[6],"where":[7],"text":[8,72],"tokens":[9],"are":[10],"progressively":[11],"generated":[12],"and":[13,25,44,53,79,95,106,112,126],"delivered":[14],"to":[15,49,117,140,156],"users":[16],"while":[17,99,150],"balancing":[18],"two":[19],"objectives:":[20],"responsiveness":[21],"(i.e.,":[22,28],"low":[23,54],"time-to-first-token)":[24],"steady":[26],"generation":[27],"required":[29],"time-between-tokens).":[30],"Standard":[31],"serving":[33,68],"systems":[34],"suffer":[35],"from":[36],"the":[37,110],"inflexibility":[38],"caused":[39],"by":[40,154],"non-preemptive":[41],"request":[42,55,59,77,119],"scheduling":[43,78],"reactive":[45],"memory":[46,108],"management,":[47],"leading":[48],"poor":[50],"resource":[51],"utilization":[52],"processing":[56],"parallelism":[57],"under":[58],"bursts.":[60],"Therefore,":[61],"we":[62],"present":[63],"TokenFlow,":[64],"a":[65],"novel":[66],"system":[69],"with":[70,115],"enhanced":[71],"streaming":[73],"performance":[74],"via":[75],"preemptive":[76],"proactive":[80],"key-value":[81],"(KV)":[82],"cache":[83,103],"management.":[84],"TokenFlow":[85,137],"dynamically":[86],"prioritizes":[87],"requests":[88],"based":[89],"on":[90,124],"real-time":[91],"buffer":[93],"occupancy":[94],"consumption":[97],"rate,":[98],"actively":[100],"transferring":[101],"KV":[102],"between":[104],"GPU":[105],"CPU":[107],"in":[109],"background":[111],"overlapping":[113],"I/O":[114],"computation":[116],"minimize":[118],"preemption":[120],"overhead.":[121],"Extensive":[122],"experiments":[123],"Llama3-8B":[125],"Qwen2.5-32B":[127],"across":[128],"multiple":[129],"GPUs":[130],"(RTX":[131],"4090,":[132],"A6000,":[133],"H200)":[134],"demonstrate":[135],"that":[136],"achieves":[138],"up":[139,155],"82.5%":[141],"higher":[142],"effective":[143],"throughput":[144],"(accounting":[145],"for":[146],"actual":[147],"user":[148],"consumption)":[149],"reducing":[151],"P99":[152],"TTFT":[153],"80.2%,":[157],"without":[158],"degrading":[159],"overall":[160],"throughput.":[162]},"counts_by_year":[],"updated_date":"2026-04-26T06:01:38.667478","created_date":"2025-10-10T00:00:00"}
