{"id":"https://openalex.org/W7154723983","doi":"https://doi.org/10.48550/arxiv.2604.15039","title":"Prefill-as-a-Service: KVCache of Next-Generation Models Could Go Cross-Datacenter","display_name":"Prefill-as-a-Service: KVCache of Next-Generation Models Could Go Cross-Datacenter","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7154723983","doi":"https://doi.org/10.48550/arxiv.2604.15039"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.15039","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15039","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.15039","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132782270","display_name":"Ruoyu Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qin, Ruoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133883584","display_name":"Weiran He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Weiran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133880947","display_name":"Yaoyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yaoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133842736","display_name":"Zheming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zheming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133860819","display_name":"Xinran Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xinran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133857009","display_name":"Yongwei Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yongwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133852224","display_name":"Weimin Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Weimin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133912329","display_name":"Mingxing Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Mingxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5132782270"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.5647000074386597,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.5647000074386597,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10138","display_name":"Network Traffic and Congestion Control","score":0.11540000140666962,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.09740000218153,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.7164000272750854},{"id":"https://openalex.org/keywords/heterogeneous-network","display_name":"Heterogeneous network","score":0.6452999711036682},{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.6158999800682068},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.5478000044822693},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.49970000982284546},{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.46700000762939453},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.40950000286102295},{"id":"https://openalex.org/keywords/myrinet","display_name":"Myrinet","score":0.37459999322891235},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.3693999946117401}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7506999969482422},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.7164000272750854},{"id":"https://openalex.org/C158207573","wikidata":"https://www.wikidata.org/wiki/Q5747224","display_name":"Heterogeneous network","level":4,"score":0.6452999711036682},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.6158999800682068},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.5609999895095825},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.5478000044822693},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.49970000982284546},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.46700000762939453},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4650000035762787},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.40950000286102295},{"id":"https://openalex.org/C2780601250","wikidata":"https://www.wikidata.org/wiki/Q1863181","display_name":"Myrinet","level":3,"score":0.37459999322891235},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3693999946117401},{"id":"https://openalex.org/C172173386","wikidata":"https://www.wikidata.org/wiki/Q79984","display_name":"Ethernet","level":2,"score":0.35749998688697815},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C89377073","wikidata":"https://www.wikidata.org/wiki/Q1171224","display_name":"Indirection","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C65813073","wikidata":"https://www.wikidata.org/wiki/Q1622420","display_name":"High availability","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C557945733","wikidata":"https://www.wikidata.org/wiki/Q389772","display_name":"Data transmission","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C103987645","wikidata":"https://www.wikidata.org/wiki/Q985806","display_name":"Network interface","level":3,"score":0.25949999690055847}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.15039","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15039","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.15039","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15039","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4127695858478546,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Prefill-decode":[0],"(PD)":[1],"disaggregation":[2],"has":[3],"become":[4],"the":[5,133,170,176],"standard":[6],"architecture":[7,120],"for":[8,143],"large-scale":[9],"LLM":[10],"serving,":[11],"but":[12],"in":[13],"practice":[14],"its":[15],"deployment":[16,48,206],"boundary":[17],"is":[18],"still":[19,104],"determined":[20],"by":[21],"KVCache":[22,31,57,61,67,135,149],"transfer.":[23],"In":[24,193],"conventional":[25],"dense-attention":[26],"models,":[27],"prefill":[28,35,101,125,129,185],"generates":[29],"huge":[30],"traffics":[32],"that":[33,98,121,172],"keep":[34],"and":[36,49,91,110,131,163,186,212],"decode":[37,187],"tightly":[38],"coupled":[39,191],"within":[40],"a":[41,117,194,203,218],"single":[42],"high-bandwidth":[43],"network":[44],"domain,":[45],"limiting":[46],"heterogeneous":[47,72,173,205],"resource":[50],"elasticity.":[51],"Recent":[52],"hybrid-attention":[53],"architectures":[54],"substantially":[55],"reduce":[56],"size,":[58],"making":[59],"cross-cluster":[60],"transport":[62],"increasingly":[63],"plausible.":[64],"However,":[65],"smaller":[66],"alone":[68],"does":[69],"not":[70],"make":[71],"cross-datacenter":[73,118,234],"PD":[74,141,220],"serving":[75,119,210],"practical:":[76],"real":[77],"workloads":[78],"remain":[79],"bursty,":[80],"request":[81,165],"lengths":[82],"are":[83,88],"highly":[84],"skewed,":[85],"prefix":[86],"caches":[87],"unevenly":[89],"distributed,":[90],"inter-cluster":[92],"bandwidth":[93],"fluctuates.":[94],"A":[95],"naive":[96],"design":[97,168],"fully":[99],"externalizes":[100],"can":[102],"therefore":[103],"suffer":[105],"from":[106],"congestion,":[107],"unstable":[108],"queueing,":[109],"poor":[111],"utilization.":[112],"We":[113],"present":[114],"Prefill-as-a-Service":[115],"(PrfaaS),":[116],"selectively":[122],"offloads":[123],"long-context":[124],"to":[126,139],"standalone,":[127],"compute-dense":[128],"clusters":[130,142],"transfers":[132],"resulting":[134],"over":[136],"commodity":[137],"Ethernet":[138],"local":[140],"decode.":[144],"Rather":[145],"than":[146,217],"treating":[147],"reduced":[148],"as":[150],"sufficient,":[151],"PrfaaS":[152],"combines":[153],"model-side":[154],"KV":[155],"efficiency":[156],"with":[157,222],"system-side":[158],"selective":[159],"offloading,":[160],"bandwidth-aware":[161],"scheduling,":[162],"cache-aware":[164],"placement.":[166],"This":[167],"removes":[169],"requirement":[171],"accelerators":[174],"share":[175],"same":[177],"low-latency":[178],"RDMA":[179],"fabric,":[180],"enabling":[181],"independent":[182],"scaling":[183],"of":[184],"capacity":[188],"across":[189],"loosely":[190],"clusters.":[192],"case":[195],"study":[196],"using":[197],"an":[198],"internal":[199],"1T-parameter":[200],"hybrid":[201],"model,":[202],"PrfaaS-augmented":[204],"achieves":[207],"54%":[208],"higher":[209],"throughput":[211,225],"64%":[213],"lower":[214],"P90":[215],"TTFT":[216],"homogeneous":[219],"baseline,":[221],"approximately":[223],"15%":[224],"gain":[226],"at":[227],"equal":[228],"cost,":[229],"while":[230],"consuming":[231],"only":[232],"modest":[233],"bandwidth.":[235]},"counts_by_year":[],"updated_date":"2026-04-24T06:01:54.638496","created_date":"2026-04-18T00:00:00"}
