{"id":"https://openalex.org/W7160534927","doi":"https://doi.org/10.48550/arxiv.2605.05123","title":"Adaptive Policy Selection and Fine-Tuning under Interaction Budgets for Offline-to-Online Reinforcement Learning","display_name":"Adaptive Policy Selection and Fine-Tuning under Interaction Budgets for Offline-to-Online Reinforcement Learning","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160534927","doi":"https://doi.org/10.48550/arxiv.2605.05123"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.05123","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05123","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.05123","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083621014","display_name":"Alper Kamil Bozkurt","orcid":"https://orcid.org/0000-0001-5845-4003"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bozkurt, Alper Kamil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135592716","display_name":"Xiaoan Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xiaoan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135543675","display_name":"Shangtong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shangtong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135600484","display_name":"Miroslav Pajic","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pajic, Miroslav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5061499121","display_name":"Yuichi Motai","orcid":"https://orcid.org/0000-0002-1957-1896"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Motai, Yuichi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.909600019454956,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.909600019454956,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.010099999606609344,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.008100000210106373,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.804099977016449},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6812000274658203},{"id":"https://openalex.org/keywords/a-priori-and-a-posteriori","display_name":"A priori and a posteriori","score":0.6680999994277954},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6248000264167786},{"id":"https://openalex.org/keywords/online-algorithm","display_name":"Online algorithm","score":0.4090999960899353},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.36329999566078186}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.804099977016449},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7925999760627747},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6812000274658203},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.6680999994277954},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6248000264167786},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5361999869346619},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49239999055862427},{"id":"https://openalex.org/C196921405","wikidata":"https://www.wikidata.org/wiki/Q786431","display_name":"Online algorithm","level":2,"score":0.4090999960899353},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.36329999566078186},{"id":"https://openalex.org/C2986087404","wikidata":"https://www.wikidata.org/wiki/Q15946010","display_name":"Online learning","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C2780102126","wikidata":"https://www.wikidata.org/wiki/Q10928179","display_name":"Online and offline","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C2777851325","wikidata":"https://www.wikidata.org/wiki/Q7094102","display_name":"Online model","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29600000381469727}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.05123","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05123","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.05123","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05123","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.44974464178085327,"id":"https://metadata.un.org/sdg/17","display_name":"Partnerships for the goals"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0,25,161],"offline-to-online":[1],"reinforcement":[2],"learning":[3],"(O2O-RL),":[4],"policies":[5,31,192,216],"are":[6,36,137],"first":[7,186],"safely":[8],"trained":[9,32],"offline":[10,34,195],"using":[11],"previously":[12],"collected":[13],"datasets":[14],"and":[15,57,157,173,198,213],"then":[16,55,201],"further":[17],"fine-tuned":[18],"for":[19,100,170],"tasks":[20],"via":[21,38,222],"limited":[22],"online":[23,44,93,176,231],"interactions.":[24,232],"a":[26,77,88,112,115,128,133,144,166,188],"typical":[27],"O2O-RL":[28,240],"pipeline,":[29,184],"candidate":[30,191],"with":[33,49,91,120,193,242],"RL":[35,196],"evaluated":[37],"either":[39],"off-policy":[40],"evaluation":[41,45],"(OPE)":[42],"or":[43],"(OE).":[46],"The":[47],"policy":[48,78,90,117,136,171],"the":[50,182,215],"highest":[51],"estimated":[52],"value":[53],"is":[54,105,158],"deployed":[56,135],"continually":[58],"fine-tuned.":[59],"However,":[60],"this":[61,162],"setup":[62],"has":[63],"two":[64],"main":[65],"issues.":[66],"First,":[67],"OPE":[68,203],"can":[69],"be":[70],"unreliable,":[71],"making":[72,227],"it":[73],"risky":[74],"to":[75,110,132,204],"deploy":[76],"based":[79,217],"solely":[80],"on":[81,218],"those":[82],"estimates,":[83],"whereas":[84],"OE":[85],"may":[86],"identify":[87],"viable":[89],"substantial":[92],"interaction,":[94],"which":[95],"could":[96],"have":[97],"been":[98],"used":[99],"fine-tuning.":[101],"Second--and":[102],"more":[103],"importantly--it":[104],"also":[106],"often":[107],"not":[108],"possible":[109],"determine":[111],"priori":[113],"whether":[114],"pretrained":[116],"will":[118],"improve":[119],"post-deployment":[121],"fine-tuning,":[122],"especially":[123],"in":[124,139,179],"non-stationary":[125],"environments.":[126],"As":[127],"result,":[129],"procedures":[130],"committing":[131],"single":[134],"impractical":[138],"many":[140],"real-world":[141],"settings.":[142],"Moreover,":[143],"naive":[145],"remedy":[146],"that":[147,235],"exhaustively":[148],"fine-tunes":[149],"all":[150],"candidates":[151],"would":[152],"violate":[153],"interaction":[154,177],"budget":[155],"constraints":[156],"likewise":[159],"infeasible.":[160],"paper,":[163],"we":[164,185,200],"propose":[165],"novel":[167],"adaptive":[168],"approach":[169,225,237],"selection":[172],"fine-tuning":[174],"under":[175],"budgets":[178],"O2O-RL.":[180],"Following":[181],"standard":[183],"train":[187],"set":[189],"of":[190,230],"different":[194],"algorithms":[197],"hyperparameters;":[199],"perform":[202],"obtain":[205],"initial":[206],"performance":[207,221],"estimates.":[208],"We":[209,233],"next":[210],"adaptively":[211],"select":[212],"fine-tune":[214],"their":[219],"predicted":[220],"an":[223],"upper-confidence-bound":[224],"thereby":[226],"efficient":[228],"use":[229],"demonstrate":[234],"our":[236],"improves":[238],"upon":[239],"baselines":[241],"various":[243],"benchmarks.":[244]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-08T00:00:00"}
