{"id":"https://openalex.org/W7119078185","doi":"https://doi.org/10.48550/arxiv.2601.01580","title":"The Two-Stage Decision-Sampling Hypothesis: Understanding the Emergence of Self-Reflection in RL-Trained LLMs","display_name":"The Two-Stage Decision-Sampling Hypothesis: Understanding the Emergence of Self-Reflection in RL-Trained LLMs","publication_year":2026,"publication_date":"2026-01-04","ids":{"openalex":"https://openalex.org/W7119078185","doi":"https://doi.org/10.48550/arxiv.2601.01580"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.01580","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01580","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.01580","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075647402","display_name":"Zibo Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I55732556","display_name":"Arizona State University","ror":"https://ror.org/03efmqc40","country_code":"US","type":"education","lineage":["https://openalex.org/I55732556"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zhao, Zibo","raw_affiliation_strings":["Arizona State University"],"affiliations":[{"raw_affiliation_string":"Arizona State University","institution_ids":["https://openalex.org/I55732556"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024973391","display_name":"Yuanting Zha","orcid":"https://orcid.org/0000-0001-7077-1167"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zha, Yuanting","raw_affiliation_strings":["ShanghaiTech University"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100758734","display_name":"Haipeng Zhang","orcid":"https://orcid.org/0000-0001-9188-542X"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhang, Haipeng","raw_affiliation_strings":["ShanghaiTech University"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049694379","display_name":"Xingcheng Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu, Xingcheng","raw_affiliation_strings":["Shanghai Artificial Intelligence Laboratory"],"affiliations":[{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5075647402"],"corresponding_institution_ids":["https://openalex.org/I55732556"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3407999873161316,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3407999873161316,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1454000025987625,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.049800001084804535,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6323000192642212},{"id":"https://openalex.org/keywords/property","display_name":"Property (philosophy)","score":0.5343000292778015},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.48750001192092896},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.48080000281333923},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.44029998779296875},{"id":"https://openalex.org/keywords/attribution","display_name":"Attribution","score":0.37070000171661377}],"concepts":[{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6323000192642212},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.5343000292778015},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.48750001192092896},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.48080000281333923},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4715999960899353},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.46470001339912415},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.44029998779296875},{"id":"https://openalex.org/C144237770","wikidata":"https://www.wikidata.org/wiki/Q747534","display_name":"Mathematical economics","level":1,"score":0.40209999680519104},{"id":"https://openalex.org/C143299363","wikidata":"https://www.wikidata.org/wiki/Q900584","display_name":"Attribution","level":2,"score":0.37070000171661377},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33809998631477356},{"id":"https://openalex.org/C175444787","wikidata":"https://www.wikidata.org/wiki/Q39072","display_name":"Microeconomics","level":1,"score":0.31360000371932983},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2921999990940094},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.2596000134944916},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.25769999623298645},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.01580","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01580","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.01580","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01580","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.793802797794342,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Self-reflection":[0],"capabilities":[1,33],"emerge":[2],"in":[3,158],"Large":[4],"Language":[5],"Models":[6],"after":[7],"RL":[8,12,121],"post-training,":[9],"with":[10,103],"multi-turn":[11],"achieving":[13],"substantial":[14],"gains":[15],"over":[16],"SFT":[17,95,124],"counterparts.":[18],"Yet":[19],"the":[20,51,66,73],"mechanism":[21],"of":[22,34,119],"how":[23,57],"a":[24,152],"unified":[25],"optimization":[26],"objective":[27],"gives":[28],"rise":[29],"to":[30,40,55],"functionally":[31],"distinct":[32],"generating":[35],"solutions":[36],"and":[37,80,96],"evaluating":[38],"when":[39],"revise":[41],"them":[42],"remains":[43],"opaque.":[44],"To":[45],"address":[46],"this":[47],"question,":[48],"we":[49],"introduce":[50],"Gradient":[52,92,101],"Attribution":[53],"Property":[54],"characterize":[56],"reward":[58],"gradients":[59],"distribute":[60],"across":[61],"policy":[62,74],"components,":[63],"formalized":[64],"through":[65],"Two-Stage":[67],"Decision-Sampling":[68],"(DS)":[69],"Hypothesis,":[70],"which":[71],"decomposes":[72],"into":[75],"sampling":[76,149],"($\u03c0_{sample}$)":[77],"for":[78,83,156],"generation":[79],"decision":[81],"($\u03c0_{d}$)":[82,146],"verification.":[84],"We":[85,126],"prove":[86],"that":[87,108,137],"surrogate":[88],"rewards":[89],"exhibit":[90,99],"Balanced":[91],"Attribution,":[93,102],"while":[94,111],"KL":[97],"penalties":[98],"Unbalanced":[100],"length-weighting":[104],"creating":[105],"asymmetric":[106],"regularization":[107],"constrains":[109],"$\u03c0_{sample}$":[110],"leaving":[112],"$\u03c0_{d}$":[113],"under-optimized,":[114],"providing":[115,151],"an":[116],"theoretical":[117,131],"explanation":[118,155],"why":[120],"succeeds":[122],"where":[123],"fails.":[125],"also":[127],"empirically":[128],"validate":[129],"our":[130],"predictions":[132],"on":[133],"arithmetic":[134],"reasoning":[135],"demonstrates":[136],"RL's":[138],"superior":[139],"generalization":[140],"stems":[141],"primarily":[142],"from":[143],"improved":[144],"decision-making":[145],"rather":[147],"than":[148],"capabilities,":[150],"first-principles":[153],"mechanistic":[154],"self-correction":[157],"thinking":[159],"models.":[160]},"counts_by_year":[],"updated_date":"2026-01-08T20:10:11.968330","created_date":"2026-01-08T00:00:00"}
