{"id":"https://openalex.org/W7162791716","doi":"https://doi.org/10.48550/arxiv.2605.29639","title":"RTP-LLM: High-Performance Alibaba LLM Inference Engine","display_name":"RTP-LLM: High-Performance Alibaba LLM Inference Engine","publication_year":2026,"publication_date":"2026-05-28","ids":{"openalex":"https://openalex.org/W7162791716","doi":"https://doi.org/10.48550/arxiv.2605.29639"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.29639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.29639","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110780214","display_name":"Boyu Tan","orcid":"https://orcid.org/0000-0001-6345-0058"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Boyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078422343","display_name":"Jiarui Guo","orcid":"https://orcid.org/0000-0003-1551-7646"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Jiarui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137376967","display_name":"Zongwei Lv","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lv, Zongwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000640882","display_name":"Hanbo Sun","orcid":"https://orcid.org/0000-0002-7875-2064"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Hanbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137352653","display_name":"Tong Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Tong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137353235","display_name":"Kan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Kan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137374226","display_name":"Xinfei Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Xinfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137388749","display_name":"Zetao Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Zetao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137389682","display_name":"Yaxin Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Yaxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137335708","display_name":"Chi Zhang","orcid":"https://orcid.org/0009-0008-9534-1890"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113622035","display_name":"Jianning Zhang","orcid":"https://orcid.org/0000-0001-5216-1496"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jianning","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137372350","display_name":"Xi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137370160","display_name":"Wei Emma Zhang","orcid":"https://orcid.org/0000-0002-1960-4475"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137383938","display_name":"Bo Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137382297","display_name":"Silu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Silu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137332976","display_name":"Xiyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100901450","display_name":"Na He","orcid":"https://orcid.org/0000-0002-0798-5709"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Na","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023686830","display_name":"Yinghao Yu","orcid":"https://orcid.org/0000-0002-2744-845X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Yinghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137383229","display_name":"Wending Bao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bao, Wending","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137338599","display_name":"Guiyang Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Guiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137340062","display_name":"Yuxing Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Yuxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103910670","display_name":"Juncheng Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Juncheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137324127","display_name":"Nan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Nan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137362356","display_name":"Lin Yang","orcid":"https://orcid.org/0000-0003-0282-6116"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Lin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032753281","display_name":"Zechao Zhang","orcid":"https://orcid.org/0009-0003-8148-0148"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zechao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137330475","display_name":"Lu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Lu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137378485","display_name":"Guoding Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guoding","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137397101","display_name":"Tao Lan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lan, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137335679","display_name":"Lin Qu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Lin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":29,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2387000024318695,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2387000024318695,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08550000190734863,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.05860000103712082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7002000212669373},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.6140999794006348},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5188000202178955},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.5145999789237976},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.49149999022483826},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.3806000053882599},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.3783999979496002}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8199999928474426},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7002000212669373},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.6140999794006348},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5188000202178955},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.5145999789237976},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.49149999022483826},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.44760000705718994},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3968000113964081},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.3806000053882599},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.3783999979496002},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.36820000410079956},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3384000062942505},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3287000060081482},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C167713795","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"Smart Cache","level":5,"score":0.31310001015663147},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.28349998593330383},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.29639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.29639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.5681874752044678,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4,108],"revolutionized":[5],"AI":[6],"applications,":[7],"but":[8],"deploying":[9],"them":[10],"at":[11],"scale":[12],"presents":[13],"significant":[14],"challenges.":[15],"We":[16],"present":[17],"RTP-LLM,":[18],"a":[19,181],"high-performance":[20],"inference":[21],"engine":[22],"for":[23,97,184],"industrial-scale":[24],"LLM":[25,186],"deployment,":[26],"successfully":[27],"deployed":[28],"across":[29,102],"Alibaba":[30],"Group":[31],"serving":[32],"over":[33],"100":[34],"million":[35],"users.":[36],"RTP-LLM":[37,79],"addresses":[38],"fundamental":[39],"bottlenecks":[40],"through":[41],"integrated":[42],"design.":[43],"It":[44],"optimizes":[45],"model":[46,104,132],"loading":[47,133],"via":[48],"file-order-driven":[49],"I/O":[50],"and":[51,91,115,129,150,157,161,176],"parallel":[52],"I/O-communication":[53],"overlapping.":[54],"The":[55,121],"Prefill-Decode":[56],"Disaggregation":[57],"architecture":[58,175],"decouples":[59],"compute-intensive":[60],"prefill":[61],"from":[62],"memory-bound":[63],"decode":[64],"phases,":[65],"combined":[66],"with":[67,95,140,166],"hierarchical":[68],"multi-tiered":[69],"KV":[70,88],"cache":[71,75,89,142],"management":[72],"enabling":[73],"efficient":[74],"reuse.":[76],"In":[77],"addition,":[78],"incorporates":[80],"modular":[81],"speculative":[82,155],"decoding":[83,156],"supporting":[84],"multiple":[85],"algorithms,":[86],"adaptive":[87],"quantization,":[90],"decoupled":[92],"multimodal":[93,158],"processing,":[94],"support":[96],"multi-level":[98],"parallelism.":[99],"Comprehensive":[100],"evaluations":[101],"diverse":[103],"architectures":[105],"(8B-235B":[106],"parameters)":[107],"been":[109],"conducted,":[110],"where":[111],"both":[112],"controlled":[113],"benchmarks":[114],"real":[116],"production":[117,146],"workloads":[118],"are":[119],"used.":[120],"results":[122],"demonstrate":[123],"RTP-LLM's":[124,173],"superior":[125],"performance":[126],"against":[127],"vLLM":[128],"SGLang:":[130],"4.7x-6.3x":[131],"speedup,":[134],"35-37%":[135],"TTFT":[136,168],"P95":[137],"latency":[138,164],"reduction":[139,165],"215%":[141],"reuse":[143],"improvement":[144,169],"in":[145,154,170],"traffic":[147],"scheduling,":[148],"1.12x-2.48x":[149],"1.86x-2.52x":[151],"throughput":[152],"improvements":[153],"inference,":[159],"respectively,":[160],"35-40%":[162],"batch":[163],"1.9x-3.0x":[167],"quantized":[171],"inference.":[172],"production-proven":[174],"open-source":[177],"availability":[178],"make":[179],"it":[180],"comprehensive":[182],"solution":[183],"industrial":[185],"deployment.":[187]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-30T00:00:00"}
