{"id":"https://openalex.org/W7162405429","doi":"https://doi.org/10.48550/arxiv.2605.25507","title":"Credit Assignment with Resets in Language Model Reasoning","display_name":"Credit Assignment with Resets in Language Model Reasoning","publication_year":2026,"publication_date":"2026-05-25","ids":{"openalex":"https://openalex.org/W7162405429","doi":"https://doi.org/10.48550/arxiv.2605.25507"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.25507","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25507","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.25507","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137004106","display_name":"Ankur Samanta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samanta, Ankur","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102706189","display_name":"Akshayaa Magesh","orcid":"https://orcid.org/0000-0002-4627-9321"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Magesh, Akshayaa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137060130","display_name":"Ayush Jain","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jain, Ayush","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137051296","display_name":"Youliang Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Youliang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124968669","display_name":"Daniel Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124910710","display_name":"Kavosh Asadi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Asadi, Kavosh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124968669","display_name":"Daniel Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hassani, Kaveh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072012496","display_name":"Kaveh Hassani","orcid":"https://orcid.org/0000-0001-9162-9442"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sajda, Paul","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137001743","display_name":"Paul Sajda","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhandari, Jalaj","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5038161679","display_name":"Jalaj Bhandari","orcid":"https://orcid.org/0000-0002-7115-8986"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Efroni, Yonathan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2874000072479248,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2874000072479248,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2371000051498413,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1039000004529953,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reset","display_name":"Reset (finance)","score":0.5583000183105469},{"id":"https://openalex.org/keywords/oracle","display_name":"Oracle","score":0.5339999794960022},{"id":"https://openalex.org/keywords/outcome","display_name":"Outcome (game theory)","score":0.5216000080108643},{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.5131000280380249},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4864000082015991},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.44909998774528503},{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.3785000145435333},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.36309999227523804}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7297999858856201},{"id":"https://openalex.org/C2779795794","wikidata":"https://www.wikidata.org/wiki/Q7315343","display_name":"Reset (finance)","level":2,"score":0.5583000183105469},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.5339999794960022},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.5216000080108643},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.5131000280380249},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4864000082015991},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4641999900341034},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.44909998774528503},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.3785000145435333},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34209999442100525},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.34200000762939453},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C110251889","wikidata":"https://www.wikidata.org/wiki/Q1569697","display_name":"Model checking","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C2779804580","wikidata":"https://www.wikidata.org/wiki/Q102047","display_name":"Suffix","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C2780922921","wikidata":"https://www.wikidata.org/wiki/Q255189","display_name":"Paraphrase","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2768000066280365},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26989999413490295},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.25507","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25507","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.25507","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25507","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Contemporary":[0],"reinforcement":[1],"learning":[2,178],"with":[3,142,187],"verifiable":[4],"reward":[5,18],"methods":[6,132],"post-train":[7],"language":[8],"models":[9,157],"on":[10],"multi-step":[11],"reasoning":[12,50,108,159],"by":[13,44,69,168],"assigning":[14],"a":[15,24,143,174],"single":[16],"outcome":[17,81],"uniformly":[19],"across":[20],"all":[21],"tokens":[22],"in":[23,122],"trajectory.":[25],"Such":[26],"uniform":[27],"assignment":[28,39,68],"ignores":[29],"which":[30],"steps":[31],"contributed":[32],"to":[33,71,86],"success":[34],"or":[35],"failure.":[36],"Improving":[37],"credit":[38,67],"can":[40,83],"address":[41],"this":[42],"limitation":[43],"enabling":[45,64],"targeted":[46],"refinement":[47],"of":[48],"faulty":[49],"steps,":[51,109],"rather":[52],"than":[53],"updating":[54],"entire":[55],"trajectories":[56],"uniformly.":[57],"Resets":[58],"are":[59,104],"one":[60],"such":[61,95],"simple":[62],"mechanism,":[63],"more":[65],"precise":[66],"returning":[70],"an":[72,123],"intermediate":[73],"state":[74],"and":[75,110,126,158,166,177],"resampling":[76],"counterfactual":[77],"continuations,":[78],"so":[79],"that":[80,90,146],"differences":[82],"be":[84],"attributed":[85],"decisions":[87],"made":[88],"at":[89,173],"point.":[91],"We":[92,129],"propose":[93],"two":[94],"methods:":[96],"Random-Reset":[97],"Policy":[98,112,136],"Optimization":[99,113],"(RRPO),":[100],"where":[101,115],"reset":[102,176],"states":[103,149],"drawn":[105],"randomly":[106],"from":[107,179],"Self-Reset":[111],"(SRPO),":[114],"the":[116,119,134,184],"model":[117,185],"self-localizes":[118],"erroneous":[120],"step":[121],"incorrect":[124],"trajectory":[125],"resets":[127],"there.":[128],"analyze":[130],"these":[131],"within":[133],"Conservative":[135],"Iteration":[137],"(CPI)":[138],"framework.":[139],"Extending":[140],"CPI":[141],"credit-assignment":[144],"oracle":[145],"targets":[147],"improvable":[148],"yields":[150],"provable":[151],"improvements":[152],"over":[153],"random":[154],"resets.":[155],"Across":[156],"benchmarks,":[160],"SRPO":[161],"consistently":[162],"outperforms":[163],"standard":[164],"GRPO":[165],"RRPO":[167],"sampling":[169],"multiple":[170],"suffix":[171],"continuations":[172],"self-localized":[175],"their":[180],"rewards,":[181],"using":[182],"only":[183],"itself":[186],"no":[188],"external":[189],"supervision.":[190]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-27T00:00:00"}
