{"id":"https://openalex.org/W7083835719","doi":"https://doi.org/10.48550/arxiv.2509.24381","title":"RServe: Overlapping Encoding and Prefill for Efficient LMM Inference","display_name":"RServe: Overlapping Encoding and Prefill for Efficient LMM Inference","publication_year":2025,"publication_date":"2025-09-29","ids":{"openalex":"https://openalex.org/W7083835719","doi":"https://doi.org/10.48550/arxiv.2509.24381"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.24381","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.24381","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.24381","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Guo, Tianyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guo, Tianyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xu, Tianming","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Tianming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Xianjie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xianjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Junru","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Junru","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xiao, Nong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Nong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Zhang, Xianwei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xianwei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12680","display_name":"Cultural and Mythological Studies","score":0.11540000140666962,"subfield":{"id":"https://openalex.org/subfields/1200","display_name":"General Arts and Humanities"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12680","display_name":"Cultural and Mythological Studies","score":0.11540000140666962,"subfield":{"id":"https://openalex.org/subfields/1200","display_name":"General Arts and Humanities"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13845","display_name":"Classical Studies and Legal History","score":0.07150000333786011,"subfield":{"id":"https://openalex.org/subfields/1202","display_name":"History"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10165","display_name":"Classical Antiquity Studies","score":0.06599999964237213,"subfield":{"id":"https://openalex.org/subfields/3314","display_name":"Anthropology"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.718999981880188},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6787999868392944},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6466000080108643},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.6202999949455261},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6172999739646912},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5697000026702881},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.506600022315979},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3382999897003174}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8446000218391418},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.718999981880188},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6787999868392944},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6466000080108643},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.6202999949455261},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6172999739646912},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5697000026702881},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.506600022315979},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3822999894618988},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3382999897003174},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3068999946117401},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28619998693466187},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.28040000796318054},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2759999930858612},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.26899999380111694},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C2780186347","wikidata":"https://www.wikidata.org/wiki/Q11414","display_name":"Subnetwork","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.24381","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.24381","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.24381","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.24381","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4285854995250702,"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"multimodal":[1,11,155],"models":[2,22],"(LMMs)":[3],"typically":[4],"employ":[5],"an":[6,108],"encoding":[7,47,71,142,156],"module":[8,48,72,143],"to":[9,20,34,54,87,122,179,218,225],"transform":[10],"data":[12,60],"inputs":[13],"into":[14],"embeddings,":[15],"which":[16],"are":[17],"then":[18],"fed":[19],"language":[21,51,145,163],"for":[23],"further":[24],"processing.":[25],"However,":[26],"efficiently":[27,113],"serving":[28,43,230],"LMMs":[29,208],"remains":[30],"highly":[31],"challenging":[32],"due":[33],"the":[35,46,50,70,74,103,137,141,158,162,197],"inherent":[36],"complexity":[37],"of":[38,80,140,161,199,216],"their":[39],"inference":[40,110],"pipelines.":[41,118,203],"Traditional":[42],"engines":[44],"co-locate":[45],"and":[49,58,96,116,126,132,144,176,201],"model,":[52,75,146],"leading":[53],"significant":[55],"resource":[56],"interference":[57],"tight":[59],"dependency.":[61],"Recent":[62],"studies":[63],"have":[64],"alleviated":[65],"this":[66,189],"issue":[67],"by":[68,223],"disaggregating":[69],"from":[73],"following":[76],"a":[77,149,166,191],"design":[78],"style":[79],"prefill-decode":[81],"disaggregation.":[82],"Nevertheless,":[83],"these":[84],"approaches":[85],"fail":[86],"fully":[88],"exploit":[89],"parallelism":[90,128],"both":[91,130],"within":[92,165],"individual":[93],"requests":[94,99],"(intra-request)":[95],"across":[97,183],"multiple":[98],"(inter-request).":[100],"To":[101],"overcome":[102],"limitation,":[104],"we":[105],"propose":[106],"REDServe,":[107],"LMM":[109],"system":[111],"that":[112,153,195,210],"orchestrates":[114],"intra-":[115,131,200],"inter-request":[117,133,170,202],"REDServe":[119,147,172,211],"is":[120],"designed":[121],"reduce":[123],"low":[124],"latency":[125,214],"maximize":[127],"at":[129],"granularities.":[134],"Built":[135],"on":[136,206],"disaggregated":[138],"architecture":[139],"adopts":[148],"fine-grained":[150],"scheduling":[151,193],"method":[152],"overlaps":[154],"with":[157,186],"forward":[159],"computation":[160],"model":[164],"single":[167],"request.":[168],"For":[169],"pipeline,":[171],"leverages":[173],"schedulable":[174],"tokens":[175],"token":[177],"budgets":[178],"balance":[180],"computational":[181],"loads":[182],"micro-batches.":[184],"Combined":[185],"chunked":[187],"prefill,":[188],"enables":[190],"novel":[192],"strategy":[194],"coordinates":[196],"execution":[198],"Experimental":[204],"evaluations":[205],"representative":[207],"show":[209],"achieves":[212],"substantial":[213],"reduction":[215],"up":[217,224],"66%":[219],"while":[220],"improving":[221],"throughput":[222],"109%,":[226],"significantly":[227],"outperforming":[228],"existing":[229],"approaches.":[231]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
