{"id":"https://openalex.org/W7160949656","doi":"https://doi.org/10.48550/arxiv.2605.08527","title":"MARLaaS: Multi-Tenant Asynchronous Reinforcement Learning as a Service","display_name":"MARLaaS: Multi-Tenant Asynchronous Reinforcement Learning as a Service","publication_year":2026,"publication_date":"2026-05-08","ids":{"openalex":"https://openalex.org/W7160949656","doi":"https://doi.org/10.48550/arxiv.2605.08527"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08527","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08527","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08527","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081986793","display_name":"Timothy T. Yu","orcid":"https://orcid.org/0000-0001-8758-0578"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Timothy Tin Long","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135923992","display_name":"Gursimran Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Gursimran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135969021","display_name":"Ge Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Ge","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094299388","display_name":"Hanieh Sadri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sadri, Hanieh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135913386","display_name":"Yong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5021637858","display_name":"Zhenan Fan","orcid":"https://orcid.org/0000-0001-5116-2956"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Zhenan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.28529998660087585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.28529998660087585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.18770000338554382,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1256999969482422,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.8140000104904175},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.7430999875068665},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6144999861717224},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5954999923706055},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.5142999887466431},{"id":"https://openalex.org/keywords/pace","display_name":"Pace","score":0.4634999930858612},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.42809998989105225},{"id":"https://openalex.org/keywords/model-checking","display_name":"Model checking","score":0.3481000065803528}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8202999830245972},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.8140000104904175},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.7430999875068665},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6144999861717224},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5954999923706055},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.5142999887466431},{"id":"https://openalex.org/C2777526511","wikidata":"https://www.wikidata.org/wiki/Q691543","display_name":"Pace","level":2,"score":0.4634999930858612},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.42809998989105225},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3653999865055084},{"id":"https://openalex.org/C110251889","wikidata":"https://www.wikidata.org/wiki/Q1569697","display_name":"Model checking","level":2,"score":0.3481000065803528},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.34709998965263367},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.335999995470047},{"id":"https://openalex.org/C2777072894","wikidata":"https://www.wikidata.org/wiki/Q4812204","display_name":"Asynchronous learning","level":5,"score":0.33309999108314514},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.319599986076355},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.31630000472068787},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3059000074863434},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C16320812","wikidata":"https://www.wikidata.org/wiki/Q1812200","display_name":"Idle","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C188116033","wikidata":"https://www.wikidata.org/wiki/Q2664563","display_name":"Q-learning","level":3,"score":0.25189998745918274},{"id":"https://openalex.org/C7923308","wikidata":"https://www.wikidata.org/wiki/Q4812211","display_name":"Asynchronous system","level":5,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08527","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08527","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08527","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08527","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"Learning":[1],"from":[2],"Verifiable":[3],"Rewards":[4],"(RLVR)":[5],"has":[6],"significantly":[7],"improved":[8],"the":[9,107],"reasoning":[10],"capabilities":[11],"of":[12],"large":[13],"language":[14],"models":[15,31],"(LLMs),":[16],"particularly":[17],"in":[18,114],"multi-turn":[19],"agentic":[20],"settings":[21,128],"involving":[22],"environment":[23,91],"interaction":[24],"like":[25],"tool":[26],"use.":[27],"However,":[28],"fine-tuning":[29,56],"such":[30],"remains":[32],"prohibitively":[33],"expensive":[34],"due":[35],"to":[36,104,132,147],"high":[37],"computational":[38],"requirements,":[39],"limiting":[40],"accessibility.":[41],"We":[42],"propose":[43],"MARLaaS":[44,136],"(Multi-tenant":[45],"Asynchronous":[46],"RL":[47,55,108],"as":[48],"a":[49,51,72,83],"Service),":[50],"system":[52],"for":[53],"concurrent":[54,134],"across":[57,75],"multiple":[58],"users":[59],"and":[60,81,93,123,149],"tasks.":[61],"Our":[62],"approach":[63],"is":[64],"based":[65],"on":[66],"two":[67],"key":[68],"ideas:":[69],"(1)":[70],"sharing":[71],"base":[73],"model":[74],"tenants":[76],"using":[77],"lightweight":[78],"LoRA":[79],"adapters,":[80],"(2)":[82],"disaggregated":[84],"asynchronous":[85],"architecture":[86],"that":[87],"decouples":[88],"rollout":[89],"generation,":[90],"interaction,":[92],"policy":[94],"training":[95,152],"into":[96],"independently":[97],"scheduled":[98],"stages.":[99],"This":[100],"design":[101],"enables":[102],"tasks":[103],"progress":[105],"through":[106],"pipeline":[109],"at":[110],"their":[111],"own":[112],"pace":[113],"an":[115],"event-driven":[116],"manner,":[117],"reducing":[118,150],"cross-task":[119],"interference,":[120],"idle":[121],"time,":[122],"end-to-end":[124,151],"latency.":[125],"In":[126],"multi-task":[127],"(we":[129],"report":[130],"up":[131,146],"32":[133],"tasks),":[135],"achieves":[137],"single-task":[138],"state-of-the-art":[139],"performance":[140],"while":[141],"improving":[142],"accelerator":[143],"utilization":[144],"by":[145,154],"4.3x":[148],"time":[153],"85%.":[155]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
