{"id":"https://openalex.org/W7140288136","doi":"https://doi.org/10.48550/arxiv.2603.23355","title":"Off-Policy Value-Based Reinforcement Learning for Large Language Models","display_name":"Off-Policy Value-Based Reinforcement Learning for Large Language Models","publication_year":2026,"publication_date":"2026-03-24","ids":{"openalex":"https://openalex.org/W7140288136","doi":"https://doi.org/10.48550/arxiv.2603.23355"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23355","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23355","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130601620","display_name":"Peng-Yuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Peng-Yuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130605259","display_name":"Ziniu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ziniu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130589524","display_name":"Tian Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Tian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130552708","display_name":"Bohan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Bohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130584294","display_name":"Tian-Shuo Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Tian-Shuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130614651","display_name":"Chenyang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, ChenYang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130575539","display_name":"Xiong-Hui Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xiong-Hui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130550370","display_name":"Yi-Chen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yi-Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130582704","display_name":"Tianyun Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Tianyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027220244","display_name":"Congliang Chen","orcid":"https://orcid.org/0000-0002-9795-4200"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Congliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130550475","display_name":"Yang Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Yang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5130601620"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3407000005245209,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3407000005245209,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.13809999823570251,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.06759999692440033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7921000123023987},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6901000142097473},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.623199999332428},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5917999744415283},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.49380001425743103},{"id":"https://openalex.org/keywords/outcome","display_name":"Outcome (game theory)","score":0.4025999903678894},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3555999994277954}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7921000123023987},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7376000285148621},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6901000142097473},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6244000196456909},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.623199999332428},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5936999917030334},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5917999744415283},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.49380001425743103},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.4025999903678894},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3555999994277954},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2888999879360199},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.25619998574256897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23355","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23355","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Improving":[0],"data":[1,34],"utilization":[2],"efficiency":[3,122],"is":[4,17,144],"critical":[5],"for":[6,11,24,59,151],"scaling":[7],"reinforcement":[8],"learning":[9],"(RL)":[10],"long-horizon":[12],"tasks":[13],"where":[14],"generating":[15],"trajectories":[16],"expensive.":[18],"However,":[19],"the":[20],"dominant":[21],"RL":[22,57,143],"methods":[23,150],"LLMs":[25,60],"are":[26],"largely":[27],"on-policy:":[28],"they":[29],"update":[30],"each":[31],"batch":[32],"of":[33,94,126],"only":[35,107],"once,":[36],"discard":[37],"it,":[38],"and":[39,123,130],"then":[40],"collect":[41],"fresh":[42],"samples,":[43],"resulting":[44],"in":[45,114,128,132],"poor":[46],"sample":[47],"efficiency.":[48],"In":[49],"this":[50],"work,":[51],"we":[52],"explore":[53],"an":[54],"alternative":[55,147],"value-based":[56,142],"framework":[58],"that":[61,72,104,141],"naturally":[62,87],"enables":[63],"off-policy":[64],"learning.":[65],"We":[66],"propose":[67],"ReVal,":[68],"a":[69,145],"Bellman-update-based":[70],"method":[71],"combines":[73],"stepwise":[74],"signals":[75,81],"capturing":[76],"internal":[77],"consistency":[78],"with":[79],"trajectory-level":[80],"derived":[82],"from":[83],"outcome":[84],"verification.":[85],"ReVal":[86,105,119],"supports":[88],"replay-buffer-based":[89],"training,":[90],"allowing":[91],"efficient":[92],"reuse":[93],"past":[95],"trajectories.":[96],"Experiments":[97],"on":[98],"standard":[99],"mathematical":[100],"reasoning":[101],"benchmarks":[102],"show":[103],"not":[106],"converges":[108],"faster":[109],"but":[110],"also":[111],"outperforms":[112],"GRPO":[113],"final":[115],"performance.":[116],"On":[117],"DeepSeek-R1-Distill-1.5B,":[118],"improves":[120],"training":[121],"achieves":[124],"improvement":[125],"2.7%":[127],"AIME24":[129],"4.5%":[131],"out-of-domain":[133],"benchmark":[134],"GPQA":[135],"over":[136],"GRPO.":[137],"These":[138],"results":[139],"suggest":[140],"practical":[146],"to":[148],"policy-based":[149],"LLM":[152],"training.":[153]},"counts_by_year":[],"updated_date":"2026-03-26T06:10:45.909354","created_date":"2026-03-26T00:00:00"}
