{"id":"https://openalex.org/W7154597459","doi":"https://doi.org/10.48550/arxiv.2604.14142","title":"From $P(y|x)$ to $P(y)$: Investigating Reinforcement Learning in Pre-train Space","display_name":"From $P(y|x)$ to $P(y)$: Investigating Reinforcement Learning in Pre-train Space","publication_year":2026,"publication_date":"2026-04-15","ids":{"openalex":"https://openalex.org/W7154597459","doi":"https://doi.org/10.48550/arxiv.2604.14142"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.14142","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14142","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.14142","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133798220","display_name":"Yuqiao Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tan, Yuqiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113266707","display_name":"Minzheng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Minzheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133812261","display_name":"Bo Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Bo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133757056","display_name":"Zichen Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zichen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133783206","display_name":"Tian Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Tian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133790002","display_name":"Shizhu He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Shizhu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133816478","display_name":"Jun Zhao","orcid":"https://orcid.org/0009-0005-6628-9390"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Jun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133824667","display_name":"Kang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Kang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5133798220"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.6718999743461609,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.6718999743461609,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.05660000070929527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.05420000106096268,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7098000049591064},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.5412999987602234},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.521399974822998},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4869000017642975},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.45899999141693115},{"id":"https://openalex.org/keywords/reflection","display_name":"Reflection (computer programming)","score":0.42570000886917114},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.4124999940395355}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7098000049591064},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6521999835968018},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.5412999987602234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5249000191688538},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.521399974822998},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4869000017642975},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.45899999141693115},{"id":"https://openalex.org/C65682993","wikidata":"https://www.wikidata.org/wiki/Q1056451","display_name":"Reflection (computer programming)","level":2,"score":0.42570000886917114},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.4124999940395355},{"id":"https://openalex.org/C34388435","wikidata":"https://www.wikidata.org/wiki/Q2267362","display_name":"Bounded function","level":2,"score":0.3677999973297119},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3422999978065491},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.3402999937534332},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.3253999948501587},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C2781002164","wikidata":"https://www.wikidata.org/wiki/Q6822311","display_name":"Meta learning (computer science)","level":3,"score":0.26409998536109924},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.26170000433921814}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.14142","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14142","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.14142","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14142","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"reinforcement":[1],"learning":[2],"with":[3,168],"verifiable":[4],"rewards":[5],"(RLVR)":[6],"significantly":[7],"enhances":[8],"LLM":[9],"reasoning":[10,43,68,135,173,205],"by":[11,22,41,147],"optimizing":[12],"the":[13,23,30,35,92,172,199],"conditional":[14],"distribution":[15,32,63],"P(y|x),":[16,101],"its":[17],"potential":[18],"is":[19],"fundamentally":[20],"bounded":[21],"base":[24],"model's":[25],"existing":[26],"output":[27],"distribution.":[28],"Optimizing":[29],"marginal":[31],"P(y)":[33,98],"in":[34],"Pre-train":[36],"Space":[37,77,158],"addresses":[38],"this":[39,71],"bottleneck":[40],"encoding":[42],"ability":[44],"and":[45,89,99,144,149],"preserving":[46],"broad":[47],"exploration":[48],"capacity.":[49],"Yet,":[50],"conventional":[51],"pre-training":[52],"relies":[53],"on":[54],"static":[55],"corpora":[56],"for":[57,108,129,180],"passive":[58],"learning,":[59],"leading":[60],"to":[61,85,170,177],"a":[62,105,114,161,202],"shift":[64],"that":[65,165,186,193],"hinders":[66],"targeted":[67],"enhancement.":[69],"In":[70],"paper,":[72],"we":[73,112,155],"introduce":[74],"PreRL":[75,103,122],"(Pre-train":[76],"RL),":[78],"which":[79],"applies":[80],"reward-driven":[81],"online":[82],"updates":[83],"directly":[84],"P(y).":[86],"We":[87],"theoretically":[88],"empirically":[90],"validate":[91],"strong":[93,190],"gradient":[94],"alignment":[95],"between":[96],"log":[97,100],"establishing":[102],"as":[104,124],"viable":[106],"surrogate":[107],"standard":[109,178],"RL.":[110],"Furthermore,":[111],"uncover":[113],"critical":[115],"mechanism:":[116],"Negative":[117],"Sample":[118],"Reinforcement":[119],"(NSR)":[120],"within":[121],"serves":[123],"an":[125],"exceptionally":[126],"effective":[127],"driver":[128],"reasoning.":[130],"NSR-PreRL":[131,169],"rapidly":[132],"prunes":[133],"incorrect":[134],"spaces":[136],"while":[137],"stimulating":[138],"endogenous":[139],"reflective":[140],"behaviors,":[141],"increasing":[142],"transition":[143],"reflection":[145],"thoughts":[146],"14.89x":[148],"6.54x,":[150],"respectively.":[151],"Leveraging":[152],"these":[153],"insights,":[154],"propose":[156],"Dual":[157],"RL":[159,179],"(DSRL),":[160],"Policy":[162],"Reincarnation":[163],"strategy":[164],"initializes":[166],"models":[167],"expand":[171],"horizon":[174],"before":[175],"transitioning":[176],"fine-grained":[181],"optimization.":[182],"Extensive":[183],"experiments":[184],"demonstrate":[185],"DSRL":[187],"consistently":[188],"outperforms":[189],"baselines,":[191],"proving":[192],"pre-train":[194],"space":[195],"pruning":[196],"effectively":[197],"steers":[198],"policy":[200],"toward":[201],"refined":[203],"correct":[204],"subspace.":[206]},"counts_by_year":[],"updated_date":"2026-04-17T06:04:52.305304","created_date":"2026-04-17T00:00:00"}
