{"id":"https://openalex.org/W7153047165","doi":"https://doi.org/10.48550/arxiv.2604.07941","title":"Large Language Model Post-Training: A Unified View of Off-Policy and On-Policy Learning","display_name":"Large Language Model Post-Training: A Unified View of Off-Policy and On-Policy Learning","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153047165","doi":"https://doi.org/10.48550/arxiv.2604.07941"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07941","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07941","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07941","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104579457","display_name":"Shiwan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Shiwan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019178807","display_name":"Zhihu Wang","orcid":"https://orcid.org/0000-0002-5991-0418"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhihu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133341498","display_name":"Xuyang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Xuyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133336295","display_name":"Jiaming Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jiaming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133384652","display_name":"Caiyue Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Caiyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123661444","display_name":"Chenfei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Chenfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133386245","display_name":"Liting Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Liting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133376499","display_name":"Yuhang Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133348886","display_name":"Yanzhe Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yanzhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130096817","display_name":"Hualong Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Hualong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133356835","display_name":"Zichen Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zichen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001810544","display_name":"Qiaowei Li","orcid":"https://orcid.org/0000-0002-5987-9465"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Qicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133371806","display_name":"Yong Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Yong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11840000003576279,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11840000003576279,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.1145000010728836,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08889999985694885,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6398000121116638},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5110999941825867},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.4823000133037567},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.40610000491142273},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.3596000075340271},{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.3449999988079071},{"id":"https://openalex.org/keywords/policy-learning","display_name":"Policy learning","score":0.3158000111579895}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.679099977016449},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6398000121116638},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5296000242233276},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5110999941825867},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.4823000133037567},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.3596000075340271},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.3449999988079071},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.3158000111579895},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.2754000127315521},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C2776014549","wikidata":"https://www.wikidata.org/wiki/Q3050847","display_name":"Consolidation (business)","level":2,"score":0.26409998536109924},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07941","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07941","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07941","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07941","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5863020420074463,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Post-training":[0],"has":[1],"become":[2],"central":[3],"to":[4,170],"turning":[5],"pretrained":[6],"large":[7],"language":[8],"models":[9],"(LLMs)":[10],"into":[11],"aligned,":[12],"capable,":[13],"and":[14,33,92,116,138,144,202,217],"deployable":[15],"systems.":[16],"Recent":[17],"progress":[18,224],"spans":[19],"supervised":[20],"fine-tuning":[21],"(SFT),":[22],"preference":[23,159],"optimization,":[24],"reinforcement":[25],"learning":[26,87,94],"(RL),":[27],"process":[28],"supervision,":[29],"verifier-guided":[30],"methods,":[31],"distillation,":[32],"multi-stage":[34,208],"pipelines.":[35],"Yet":[36],"these":[37],"methods":[38,101],"are":[39],"often":[40,175,193],"discussed":[41],"in":[42],"fragmented":[43],"ways,":[44],"organized":[45],"by":[46,52,78],"labels":[47],"or":[48,156],"objectives":[49],"rather":[50,198,231],"than":[51,199,232],"the":[53,75,211],"behavioral":[54,133],"bottlenecks":[55,216],"they":[56],"address.":[57],"This":[58],"survey":[59],"argues":[60],"that":[61,223],"LLM":[62],"post-training":[63,215],"is":[64,161,192],"best":[65],"understood":[66,195],"as":[67,196,206],"structured":[68],"intervention":[69],"on":[70,88,95,178,227],"model":[71,145],"behavior.":[72],"We":[73,98],"organize":[74],"field":[76],"first":[77],"trajectory":[79],"provenance,":[80],"which":[81,110,119,135],"defines":[82],"two":[83,103],"primary":[84],"regimes:":[85],"off-policy":[86,163],"externally":[89],"supplied":[90],"trajectories":[91],"on-policy":[93],"learner-generated":[96,171,179],"rollouts.":[97],"then":[99],"interpret":[100],"through":[102],"recurring":[104],"roles":[105],"--":[106,126],"effective":[107],"support":[108,154],"expansion,":[109],"makes":[111],"useful":[112,140],"behaviors":[113],"more":[114],"reachable,":[115],"policy":[117,157],"reshaping,":[118,164],"improves":[120,176],"behavior":[121,141,177],"within":[122],"already":[123],"reachable":[124],"regions":[125],"together":[127],"with":[128],"a":[129],"complementary":[130],"systems-level":[131],"role,":[132],"consolidation,":[134],"preserves,":[136],"transfers,":[137],"amortizes":[139],"across":[142],"stages":[143],"transitions.":[146],"Under":[147],"this":[148],"view,":[149],"SFT":[150],"may":[151],"serve":[152],"either":[153],"expansion":[155],"reshaping;":[158],"optimization":[160],"usually":[162],"though":[165],"online":[166],"variants":[167],"move":[168],"closer":[169],"states.":[172],"On-policy":[173],"RL":[174],"states,":[180],"but":[181],"stronger":[182],"guidance":[183],"can":[184],"also":[185],"make":[186],"hard-to-reach":[187],"reasoning":[188],"paths":[189],"reachable.":[190],"Distillation":[191],"better":[194],"consolidation":[197],"only":[200],"compression,":[201],"hybrid":[203],"pipelines":[204],"emerge":[205],"coordinated":[207,228],"compositions.":[209],"Overall,":[210],"framework":[212],"helps":[213],"diagnose":[214],"reason":[218],"about":[219],"stage":[220],"composition,":[221],"suggesting":[222],"increasingly":[225],"depends":[226],"systems":[229],"design":[230],"any":[233],"single":[234],"dominant":[235],"objective.":[236]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-11T00:00:00"}
