{"id":"https://openalex.org/W7155409580","doi":"https://doi.org/10.48550/arxiv.2604.20503","title":"FASER: Fine-Grained Phase Management for Speculative Decoding in Dynamic LLM Serving","display_name":"FASER: Fine-Grained Phase Management for Speculative Decoding in Dynamic LLM Serving","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155409580","doi":"https://doi.org/10.48550/arxiv.2604.20503"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20503","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134436327","display_name":"Wenyan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Wenyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134410470","display_name":"Chengzhi Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Chengzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033311175","display_name":"Yanying Lin","orcid":"https://orcid.org/0000-0002-4809-9543"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yanying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5083116172","display_name":"Dmitrii Ustiugov","orcid":"https://orcid.org/0000-0003-3156-010X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ustiugov, Dmitrii","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.13600000739097595,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.13600000739097595,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.12080000340938568,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11598","display_name":"Internet Traffic Analysis and Secure E-voting","score":0.1145000010728836,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7329999804496765},{"id":"https://openalex.org/keywords/serialization","display_name":"Serialization","score":0.578499972820282},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5228000283241272},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5092999935150146},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4934999942779541},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.46880000829696655},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.460099995136261},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4359999895095825}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8543000221252441},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7329999804496765},{"id":"https://openalex.org/C52723943","wikidata":"https://www.wikidata.org/wiki/Q1127410","display_name":"Serialization","level":2,"score":0.578499972820282},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5228000283241272},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5092999935150146},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4934999942779541},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.46880000829696655},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.460099995136261},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.4377000033855438},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4359999895095825},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.421999990940094},{"id":"https://openalex.org/C19275194","wikidata":"https://www.wikidata.org/wiki/Q222903","display_name":"Multiplexing","level":2,"score":0.4002000093460083},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3926999866962433},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.3873000144958496},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.36559998989105225},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3422999978065491},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.3125},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.311599999666214},{"id":"https://openalex.org/C50661577","wikidata":"https://www.wikidata.org/wiki/Q901831","display_name":"Time-division multiplexing","level":3,"score":0.29820001125335693},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2946000099182129},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2921999990940094},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C111498074","wikidata":"https://www.wikidata.org/wiki/Q173326","display_name":"Formal verification","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C44280652","wikidata":"https://www.wikidata.org/wiki/Q104837","display_name":"Phase (matter)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C164155591","wikidata":"https://www.wikidata.org/wiki/Q2067766","display_name":"Satisfiability modulo theories","level":2,"score":0.26089999079704285}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","score":0.49457353353500366,"display_name":"Decent work and economic growth"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speculative":[0],"decoding":[1],"(SD)":[2],"is":[3,169],"a":[4,28,111,135],"widely":[5],"used":[6],"approach":[7,30],"for":[8,41,82,131],"accelerating":[9],"decode-heavy":[10],"LLM":[11],"inference":[12,16,65],"workloads.":[13],"While":[14],"online":[15,64],"workloads":[17],"are":[18,24],"highly":[19],"dynamic,":[20],"existing":[21],"SD":[22,32,117],"systems":[23,57],"rigid":[25],"and":[26,45,52,138,190],"take":[27],"coarse-grained":[29],"to":[31,62,160,188,195,198],"management.":[33,119],"They":[34],"typically":[35],"set":[36],"the":[37,47,50,75,79,83,102,128,147,153,164],"speculative":[38,129],"token":[39],"length":[40,130],"an":[42],"entire":[43,84],"batch":[44,137],"serialize":[46],"execution":[48],"of":[49,143],"draft":[51,76,165],"verification":[53,80,103,148,154],"phases.":[54],"Consequently,":[55],"these":[56],"fall":[58],"short":[59],"at":[60],"adapting":[61],"volatile":[63],"traffic.":[66],"Under":[67],"low":[68],"load,":[69,94],"they":[70,95],"exhibit":[71],"prolonged":[72],"latency":[73,192],"because":[74],"phase":[77,81,118,155],"blocks":[78],"batch,":[85],"leaving":[86],"GPU":[87,106],"computing":[88],"resources":[89],"underutilized.":[90],"Conversely,":[91],"under":[92],"high":[93],"waste":[96,124],"computation":[97],"on":[98],"rejected":[99,144],"tokens":[100,145],"during":[101],"phase,":[104],"overloading":[105],"resources.":[107],"We":[108],"introduce":[109],"FASER,":[110],"novel":[112],"system":[113],"that":[114],"features":[115],"fine-grained":[116,172],"First,":[120],"FASER":[121,151,180],"minimizes":[122],"computational":[123],"by":[125,139,186,193],"dynamically":[126],"adjusting":[127],"each":[132],"request":[133],"within":[134],"continuous":[136],"performing":[140],"early":[141],"pruning":[142],"inside":[146],"phase.":[149,166],"Second,":[150],"breaks":[152],"into":[156],"frontiers,":[157],"or":[158],"chunks,":[159],"overlap":[161,168],"them":[162],"with":[163,175],"This":[167],"achieved":[170],"via":[171],"spatial":[173],"multiplexing":[174],"minimal":[176],"resource":[177],"interference.":[178],"Our":[179],"prototype":[181],"in":[182],"vLLM":[183],"improves":[184],"throughput":[185],"up":[187,194],"53%":[189],"reduces":[191],"1.92$\\times$":[196],"compared":[197],"state-of-the-art":[199],"systems.":[200]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-24T00:00:00"}
