{"id":"https://openalex.org/W7134893184","doi":"https://doi.org/10.48550/arxiv.2603.09344","title":"Robust Regularized Policy Iteration under Transition Uncertainty","display_name":"Robust Regularized Policy Iteration under Transition Uncertainty","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134893184","doi":"https://doi.org/10.48550/arxiv.2603.09344"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.09344","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09344","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.09344","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128787362","display_name":"Hongqiang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lin, Hongqiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113584851","display_name":"Zhenghui Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Zhenghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128710090","display_name":"Weihao Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Weihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128760889","display_name":"Pengfei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Pengfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128764088","display_name":"Yiding Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yiding","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128698483","display_name":"Qixian Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Qixian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128767533","display_name":"Dongxu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Dongxu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5128787362"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.7989000082015991,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.7989000082015991,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.033799998462200165,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11206","display_name":"Model Reduction and Neural Networks","score":0.030700000002980232,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/monotonic-function","display_name":"Monotonic function","score":0.5289000272750854},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.527899980545044},{"id":"https://openalex.org/keywords/extrapolation","display_name":"Extrapolation","score":0.5270000100135803},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.47999998927116394},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.46639999747276306},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.4180999994277954},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.39719998836517334},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.37209999561309814}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5365999937057495},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.5329999923706055},{"id":"https://openalex.org/C72169020","wikidata":"https://www.wikidata.org/wiki/Q194404","display_name":"Monotonic function","level":2,"score":0.5289000272750854},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.527899980545044},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.5270000100135803},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.47999998927116394},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.46639999747276306},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.4180999994277954},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.39719998836517334},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.37209999561309814},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.37130001187324524},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.358599990606308},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.31380000710487366},{"id":"https://openalex.org/C106189395","wikidata":"https://www.wikidata.org/wiki/Q176789","display_name":"Markov decision process","level":3,"score":0.31299999356269836},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.2858000099658966},{"id":"https://openalex.org/C182365436","wikidata":"https://www.wikidata.org/wiki/Q50701","display_name":"Variable (mathematics)","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C14646407","wikidata":"https://www.wikidata.org/wiki/Q1430750","display_name":"Bellman equation","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C32230216","wikidata":"https://www.wikidata.org/wiki/Q7882499","display_name":"Uncertainty quantification","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2745000123977661},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.09344","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09344","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.09344","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09344","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.7646186947822571}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Offline":[0],"reinforcement":[1],"learning":[2,9],"(RL)":[3],"enables":[4],"data-efficient":[5],"and":[6,32,41,68,95,122],"safe":[7],"policy":[8,23,54,71,99,182],"without":[10],"online":[11],"exploration,":[12],"but":[13],"its":[14],"performance":[15,170],"often":[16],"degrades":[17],"under":[18],"distribution":[19],"shift.":[20],"The":[21],"learned":[22,33],"may":[24],"visit":[25],"out-of-distribution":[26,186],"state-action":[27],"pairs":[28],"where":[29],"value":[30],"estimates":[31],"dynamics":[34],"are":[35],"unreliable.":[36],"To":[37],"address":[38],"policy-induced":[39],"extrapolation":[40],"transition":[42,58],"uncertainty":[43,66],"in":[44],"a":[45,61,91,104,120],"unified":[46],"framework,":[47],"we":[48],"formulate":[49],"offline":[50],"RL":[51],"as":[52,60],"robust":[53,105,134,169],"optimization,":[55],"treating":[56],"the":[57,70,73,85,116,126,132,156,164,181],"kernel":[59],"decision":[62],"variable":[63],"within":[64],"an":[65,97],"set":[67],"optimizing":[69],"against":[72],"worst-case":[74],"dynamics.":[75],"We":[76,109],"propose":[77],"Robust":[78],"Regularized":[79],"Policy":[80],"Iteration":[81],"(RRPI),":[82],"which":[83,179],"replaces":[84],"intractable":[86],"max-min":[87],"bilevel":[88],"objective":[89,135],"with":[90,136,175],"tractable":[92],"KL-regularized":[93],"surrogate":[94,127],"derives":[96],"efficient":[98],"iteration":[100],"procedure":[101],"based":[102],"on":[103,139,155,163],"regularized":[106],"Bellman":[107],"operator.":[108],"provide":[110],"theoretical":[111],"guarantees":[112],"by":[113,171],"showing":[114],"that":[115,123,143],"proposed":[117],"operator":[118],"is":[119],"$\u03b3$-contraction":[121],"iteratively":[124],"updating":[125],"yields":[128],"monotonic":[129],"improvement":[130],"of":[131,158],"original":[133],"convergence.":[137],"Experiments":[138],"D4RL":[140],"benchmarks":[141],"demonstrate":[142],"RRPI":[144,167],"achieves":[145],"strong":[146],"average":[147],"performance,":[148],"outperforming":[149],"recent":[150],"baselines":[151],"including":[152],"percentile-based":[153],"methods":[154],"majority":[157],"environments":[159],"while":[160],"remaining":[161],"competitive":[162],"rest.":[165],"Moreover,":[166],"exhibits":[168],"aligning":[172],"lower":[173],"$Q$-values":[174],"high":[176],"epistemic":[177],"uncertainty,":[178],"prevents":[180],"from":[183],"executing":[184],"unreliable":[185],"actions.":[187]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-12T00:00:00"}
