{"id":"https://openalex.org/W4414736234","doi":"https://doi.org/10.1145/3731569.3764834","title":"PrefillOnly: An Inference Engine for Prefill-only Workloads in Large Language Model Applications","display_name":"PrefillOnly: An Inference Engine for Prefill-only Workloads in Large Language Model Applications","publication_year":2025,"publication_date":"2025-10-01","ids":{"openalex":"https://openalex.org/W4414736234","doi":"https://doi.org/10.1145/3731569.3764834"},"language":"en","primary_location":{"id":"doi:10.1145/3731569.3764834","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731569.3764834","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3731569.3764834","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036272233","display_name":"Kuntai Du","orcid":"https://orcid.org/0000-0002-3964-4079"},"institutions":[{"id":"https://openalex.org/I886932462","display_name":"Foster-Miller (United States)","ror":"https://ror.org/01wm02973","country_code":"US","type":"company","lineage":["https://openalex.org/I886932462"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Kuntai Du","raw_affiliation_strings":["University of Chicago / TensorMesh, Inc., Foster City, California, USA"],"affiliations":[{"raw_affiliation_string":"University of Chicago / TensorMesh, Inc., Foster City, California, USA","institution_ids":["https://openalex.org/I886932462"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100412550","display_name":"Bowen Wang","orcid":"https://orcid.org/0009-0008-3818-2045"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bowen Wang","raw_affiliation_strings":["Sky Computing Lab, Tsinghua University / UC Berkeley, Berkeley, California, USA"],"affiliations":[{"raw_affiliation_string":"Sky Computing Lab, Tsinghua University / UC Berkeley, Berkeley, California, USA","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101752464","display_name":"Chen Zhang","orcid":"https://orcid.org/0000-0001-9045-9269"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chen Zhang","raw_affiliation_strings":["Sky Computing Lab, Tsinghua University / UC Berkeley, Berkeley, California, USA"],"affiliations":[{"raw_affiliation_string":"Sky Computing Lab, Tsinghua University / UC Berkeley, Berkeley, California, USA","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110827078","display_name":"Yi\u2010Ming Cheng","orcid":null},"institutions":[{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]},{"id":"https://openalex.org/I886932462","display_name":"Foster-Miller (United States)","ror":"https://ror.org/01wm02973","country_code":"US","type":"company","lineage":["https://openalex.org/I886932462"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yiming Cheng","raw_affiliation_strings":["University of Chicago / TensorMesh, Inc., Foster City, California, USA","University of Chicago, Chicago, Illinois, USA"],"affiliations":[{"raw_affiliation_string":"University of Chicago / TensorMesh, Inc., Foster City, California, USA","institution_ids":["https://openalex.org/I886932462"]},{"raw_affiliation_string":"University of Chicago, Chicago, Illinois, USA","institution_ids":["https://openalex.org/I40347166"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100771334","display_name":"Qing Lan","orcid":"https://orcid.org/0009-0007-3215-4652"},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qing Lan","raw_affiliation_strings":["LinkedIn, Mountain View, California, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn, Mountain View, California, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109706978","display_name":"H. S. Sang","orcid":"https://orcid.org/0009-0008-7388-7243"},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hejian Sang","raw_affiliation_strings":["LinkedIn, Mountain View, California, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn, Mountain View, California, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004850046","display_name":"Yihua Cheng","orcid":"https://orcid.org/0009-0006-3924-6886"},"institutions":[{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]},{"id":"https://openalex.org/I886932462","display_name":"Foster-Miller (United States)","ror":"https://ror.org/01wm02973","country_code":"US","type":"company","lineage":["https://openalex.org/I886932462"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yihua Cheng","raw_affiliation_strings":["University of Chicago / TensorMesh, Inc., Foster City, California, USA","University of Chicago, Chicago, Illinois, USA"],"affiliations":[{"raw_affiliation_string":"University of Chicago / TensorMesh, Inc., Foster City, California, USA","institution_ids":["https://openalex.org/I886932462"]},{"raw_affiliation_string":"University of Chicago, Chicago, Illinois, USA","institution_ids":["https://openalex.org/I40347166"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058454592","display_name":"Jiayi Yao","orcid":"https://orcid.org/0000-0002-8588-4356"},"institutions":[{"id":"https://openalex.org/I886932462","display_name":"Foster-Miller (United States)","ror":"https://ror.org/01wm02973","country_code":"US","type":"company","lineage":["https://openalex.org/I886932462"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiayi Yao","raw_affiliation_strings":["University of Chicago / TensorMesh, Inc., Foster City, California, USA"],"affiliations":[{"raw_affiliation_string":"University of Chicago / TensorMesh, Inc., Foster City, California, USA","institution_ids":["https://openalex.org/I886932462"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027378038","display_name":"Xiaoxuan Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210151627","display_name":"National Energy Research Scientific Computing Center","ror":"https://ror.org/05v3mvq14","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521","https://openalex.org/I4210151627"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaoxuan Liu","raw_affiliation_strings":["Sky Computing Lab, UC Berkeley, Berkeley, California, USA"],"affiliations":[{"raw_affiliation_string":"Sky Computing Lab, UC Berkeley, Berkeley, California, USA","institution_ids":["https://openalex.org/I95457486","https://openalex.org/I4210151627"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112905911","display_name":"Yong Qiao","orcid":"https://orcid.org/0009-0003-3651-6973"},"institutions":[{"id":"https://openalex.org/I4210151627","display_name":"National Energy Research Scientific Computing Center","ror":"https://ror.org/05v3mvq14","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521","https://openalex.org/I4210151627"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifan Qiao","raw_affiliation_strings":["Sky Computing Lab, UC Berkeley, Berkeley, California, USA"],"affiliations":[{"raw_affiliation_string":"Sky Computing Lab, UC Berkeley, Berkeley, California, USA","institution_ids":["https://openalex.org/I95457486","https://openalex.org/I4210151627"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041920173","display_name":"Ion Stoica","orcid":"https://orcid.org/0000-0002-5373-0088"},"institutions":[{"id":"https://openalex.org/I4210151627","display_name":"National Energy Research Scientific Computing Center","ror":"https://ror.org/05v3mvq14","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521","https://openalex.org/I4210151627"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ion Stoica","raw_affiliation_strings":["Sky Computing Lab, UC Berkeley, Berkeley, California, USA"],"affiliations":[{"raw_affiliation_string":"Sky Computing Lab, UC Berkeley, Berkeley, California, USA","institution_ids":["https://openalex.org/I95457486","https://openalex.org/I4210151627"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103258769","display_name":"Junchen Jiang","orcid":"https://orcid.org/0000-0002-6877-1683"},"institutions":[{"id":"https://openalex.org/I886932462","display_name":"Foster-Miller (United States)","ror":"https://ror.org/01wm02973","country_code":"US","type":"company","lineage":["https://openalex.org/I886932462"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Junchen Jiang","raw_affiliation_strings":["University of Chicago / TensorMesh, Inc., Foster City, California, USA"],"affiliations":[{"raw_affiliation_string":"University of Chicago / TensorMesh, Inc., Foster City, California, USA","institution_ids":["https://openalex.org/I886932462"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5036272233"],"corresponding_institution_ids":["https://openalex.org/I886932462"],"apc_list":null,"apc_paid":null,"fwci":2.4849,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.91669555,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"399","last_page":"414"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10703","display_name":"Business Process Modeling and Analysis","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.991599977016449,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7706000208854675},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6843000054359436},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.597599983215332},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5367000102996826},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.519599974155426},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.49639999866485596},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.4860999882221222},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.40869998931884766}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8051999807357788},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7706000208854675},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6843000054359436},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.597599983215332},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5367000102996826},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.519599974155426},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.49639999866485596},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.4860999882221222},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.40869998931884766},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3959999978542328},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.36489999294281006},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3382999897003174},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.33399999141693115},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.32510000467300415},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3127000033855438},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3077000081539154},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2971000075340271},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2754000127315521},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27160000801086426},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.25870001316070557}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3731569.3764834","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731569.3764834","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3731569.3764834","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731569.3764834","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W2913507876","https://openalex.org/W2965289829","https://openalex.org/W2966139567","https://openalex.org/W3095319910","https://openalex.org/W4205983429","https://openalex.org/W4225411436","https://openalex.org/W4254154714","https://openalex.org/W4254240460","https://openalex.org/W4382319938","https://openalex.org/W4387321091","https://openalex.org/W4387835442","https://openalex.org/W4389519153","https://openalex.org/W4392736881","https://openalex.org/W4401211813","https://openalex.org/W4401834466","https://openalex.org/W4404088579","https://openalex.org/W4404343279","https://openalex.org/W4405840816","https://openalex.org/W4407185545","https://openalex.org/W4408940210"],"related_works":[],"abstract_inverted_index":{"Besides":[0],"typical":[1],"generative":[2],"applications,":[3],"like":[4],"ChatGPT,":[5],"GitHub":[6],"Copilot,":[7],"and":[8,29,101,147,215],"Cursor,":[9],"we":[10,88],"observe":[11],"an":[12,52],"emerging":[13,37],"trend":[14],"that":[15,41,96,155],"LLMs":[16],"are":[17],"increasingly":[18],"used":[19],"in":[20],"traditional":[21],"discriminative":[22],"tasks,":[23],"such":[24,158,196],"as":[25,62,159,197],"recommendation,":[26],"credit":[27],"verification,":[28],"data":[30],"labeling.":[31],"The":[32],"key":[33],"characteristic":[34],"of":[35,56,82,108,126,134,144,183],"these":[36],"use":[38],"cases":[39],"is":[40,169],"the":[42,79,91,98,106,123,128,140,166,178,213],"LLM":[43,69,93,145],"generates":[44,114],"only":[45,115,119,127],"a":[46,63],"single":[47],"output":[48,73,167],"token,":[49,117],"rather":[50,132,171],"than":[51,133,172],"arbitrarily":[53],"long":[54,150],"sequence":[55],"tokens.":[57],"We":[58],"refer":[59],"to":[60,77,121,205],"this":[61,86],"prefill-only":[64,83,109,185],"workload.":[65],"However,":[66],"since":[67,112],"existing":[68],"engines":[70],"assume":[71],"arbitrary":[72],"lengths,":[74],"they":[75],"fail":[76],"leverage":[78],"unique":[80],"properties":[81,107],"workloads.":[84,110],"In":[85],"paper,":[87],"present":[89],"PrefillOnly,":[90],"first":[92],"inference":[94,99,146],"engine":[95],"improves":[97],"throughput":[100],"latency":[102],"by":[103],"fully":[104],"embracing":[105],"First,":[111],"it":[113,188],"one":[116],"PrefillOnly":[118,174,201],"needs":[120],"store":[122],"KV":[124,161],"cache":[125,162],"last":[129],"computed":[130],"layer,":[131],"all":[135],"layers.":[136],"This":[137,190],"drastically":[138],"reduces":[139],"GPU":[141],"memory":[142],"footprint":[143],"allows":[148],"handling":[149],"inputs":[151],"without":[152,211],"using":[153],"solutions":[154],"reduce":[156],"throughput,":[157],"cross-GPU":[160],"parallelization.":[163],"Second,":[164],"because":[165],"length":[168],"fixed,":[170],"arbitrary,":[173],"can":[175,202],"precisely":[176],"determine":[177],"job":[179],"completion":[180],"time":[181],"(JCT)":[182],"each":[184],"request":[186],"before":[187],"starts.":[189],"enables":[191],"efficient":[192],"JCT-aware":[193],"scheduling":[194],"policies":[195],"shortest":[198],"prefill":[199],"first.":[200],"process":[203],"up":[204],"4\u00d7":[206],"larger":[207],"queries":[208],"per":[209],"second":[210],"inflating":[212],"average":[214],"P99":[216],"latency.":[217]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
