{"id":"https://openalex.org/W7160893033","doi":"https://doi.org/10.48550/arxiv.2605.09214","title":"Fast Rates for Offline Contextual Bandits with Forward-KL Regularization under Single-Policy Concentrability","display_name":"Fast Rates for Offline Contextual Bandits with Forward-KL Regularization under Single-Policy Concentrability","publication_year":2026,"publication_date":"2026-05-09","ids":{"openalex":"https://openalex.org/W7160893033","doi":"https://doi.org/10.48550/arxiv.2605.09214"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.09214","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09214","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.09214","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135951528","display_name":"Qingyue Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Qingyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135934126","display_name":"Kaixuan Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Kaixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135931453","display_name":"Heyang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Heyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135967213","display_name":"Quanquan Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Quanquan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.8321999907493591,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12101","display_name":"Advanced Bandit Algorithms Research","score":0.8321999907493591,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.1006999984383583,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.019700000062584877,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.5575000047683716},{"id":"https://openalex.org/keywords/upper-and-lower-bounds","display_name":"Upper and lower bounds","score":0.5511000156402588},{"id":"https://openalex.org/keywords/sample-complexity","display_name":"Sample complexity","score":0.5432999730110168},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.3727000057697296},{"id":"https://openalex.org/keywords/sample-size-determination","display_name":"Sample size determination","score":0.36500000953674316},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.36090001463890076},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.3260999917984009},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.31619998812675476},{"id":"https://openalex.org/keywords/statistical-model","display_name":"Statistical model","score":0.3131999969482422}],"concepts":[{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.5575000047683716},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.5511000156402588},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5442000031471252},{"id":"https://openalex.org/C2778445095","wikidata":"https://www.wikidata.org/wiki/Q18354077","display_name":"Sample complexity","level":2,"score":0.5432999730110168},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.510699987411499},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.45489999651908875},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.3727000057697296},{"id":"https://openalex.org/C129848803","wikidata":"https://www.wikidata.org/wiki/Q2564360","display_name":"Sample size determination","level":2,"score":0.36500000953674316},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.36090001463890076},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3569999933242798},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.35499998927116394},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3260999917984009},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C14646407","wikidata":"https://www.wikidata.org/wiki/Q1430750","display_name":"Bellman equation","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C2986587452","wikidata":"https://www.wikidata.org/wiki/Q938438","display_name":"Statistical analysis","level":2,"score":0.29249998927116394},{"id":"https://openalex.org/C3020318244","wikidata":"https://www.wikidata.org/wiki/Q4812187","display_name":"Large sample","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C196921405","wikidata":"https://www.wikidata.org/wiki/Q786431","display_name":"Online algorithm","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2653999924659729},{"id":"https://openalex.org/C3018263672","wikidata":"https://www.wikidata.org/wiki/Q1296251","display_name":"Efficient algorithm","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.25769999623298645},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.09214","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09214","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.09214","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09214","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.8135253190994263,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"\\emph{Kullback-Leibler}":[0],"(KL)":[1],"regularization":[2],"is":[3],"ubiquitous":[4],"in":[5,9,31,52,80,106,116,144,164],"reinforcement":[6],"learning":[7],"algorithms":[8],"the":[10,34,58,75,103,113,121,138,155,160,165,170],"form":[11],"of":[12,69,90,128,140,146],"\\emph{reverse}":[13],"or":[14,50],"\\emph{forward}":[15],"KL.":[16],"Recent":[17],"studies":[18],"have":[19],"demonstrated":[20],"$\u03b5^{-1}$-type":[21],"fast":[22],"rates":[23],"for":[24,40],"decision":[25],"making":[26],"under":[27,88],"reverse":[28],"KL":[29],"regularization,":[30],"contrast":[32],"to":[33,169],"standard":[35],"$\u03b5^{-2}$-type":[36],"sample":[37,157],"complexity.":[38],"However,":[39],"forward-KL-regularized":[41,70,156],"objectives,":[42],"existing":[43],"statistical":[44,147],"analyses":[45],"are":[46],"either":[47],"not":[48],"applicable":[49],"result":[51],"$\\tilde{O}(\u03b5^{-2})$":[53],"slow":[54,162],"rates.":[55,148],"We":[56,73],"take":[57],"first":[59,76],"step":[60],"towards":[61],"addressing":[62],"this":[63],"problem":[64],"via":[65],"a":[66,107],"streamlined":[67],"analysis":[68],"offline":[71],"CBs.":[72],"give":[74],"$\\tilde{O}(\u03b5^{-1})$":[77],"upper":[78,142],"bounds":[79,143,151],"tabular":[81],"and":[82,110],"general":[83],"function":[84],"approximation":[85],"settings,":[86],"both":[87],"notions":[89],"\\emph{single-policy":[91],"concentrability}.":[92],"In":[93],"particular,":[94],"our":[95,141],"convex-analytical":[96],"pipeline":[97],"unifies":[98],"these":[99],"settings":[100],"by":[101],"exploiting":[102],"pessimism":[104],"principle":[105],"novel":[108],"way":[109],"completely":[111],"bypasses":[112],"proof":[114],"routines":[115],"previous":[117],"works":[118],"based":[119],"on":[120],"mean":[122],"value":[123],"theorem,":[124],"which":[125],"might":[126],"be":[127],"independent":[129],"interest.":[130],"Moreover,":[131],"we":[132],"provide":[133],"rate-optimal":[134],"lower":[135,150],"bounds,":[136],"manifesting":[137],"tightness":[139],"terms":[145],"Our":[149],"also":[152],"demonstrate":[153],"that":[154],"complexity":[158],"recovers":[159],"unregularized":[161],"rate":[163],"low-regularization":[166],"regime,":[167],"similarly":[168],"reverse-KL":[171],"regularization.":[172]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-13T00:00:00"}
