{"id":"https://openalex.org/W7148875433","doi":"https://doi.org/10.48550/arxiv.2604.01591","title":"ThinkTwice: Jointly Optimizing Large Language Models for Reasoning and Self-Refinement","display_name":"ThinkTwice: Jointly Optimizing Large Language Models for Reasoning and Self-Refinement","publication_year":2026,"publication_date":"2026-04-02","ids":{"openalex":"https://openalex.org/W7148875433","doi":"https://doi.org/10.48550/arxiv.2604.01591"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.01591","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01591","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.01591","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132850350","display_name":"Difan Jiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiao, Difan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111351135","display_name":"Q. G. Wen","orcid":"https://orcid.org/0000-0002-5556-9419"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Qianfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132858582","display_name":"Blair Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Blair","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132834486","display_name":"Zhenwei Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Zhenwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129659352","display_name":"Ashton Anderson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anderson, Ashton","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2924000024795532,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2924000024795532,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.16220000386238098,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.06790000200271606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.8634999990463257},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.558899998664856},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.47380000352859497},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.4146000146865845},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.38769999146461487},{"id":"https://openalex.org/keywords/structured-prediction","display_name":"Structured prediction","score":0.3662000000476837},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.35109999775886536},{"id":"https://openalex.org/keywords/reasoning-system","display_name":"Reasoning system","score":0.34369999170303345}],"concepts":[{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.8634999990463257},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7748000025749207},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6032000184059143},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.558899998664856},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.47380000352859497},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44830000400543213},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.4146000146865845},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.38769999146461487},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.3662000000476837},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.35109999775886536},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.33640000224113464},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.33169999718666077},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.325300008058548},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3133000135421753},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3021000027656555},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.29409998655319214},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.01591","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01591","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.01591","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01591","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"introduce":[1],"ThinkTwice,":[2],"a":[3,151,166],"simple":[4],"two-phase":[5],"framework":[6],"that":[7],"jointly":[8],"optimizes":[9,34,42],"LLMs":[10],"to":[11,49],"solve":[12],"reasoning":[13,39,71,85,162],"problems":[14],"and":[15,73,79,86,109,139,163,168],"refine":[16],"the":[17,35,50,54,122,147],"answers,":[18],"based":[19],"on":[20,37,44,96,101],"Group":[21],"Relative":[22],"Policy":[23],"Optimization":[24],"(GRPO).":[25],"In":[26],"each":[27],"pair":[28],"of":[29,121,125,161],"training":[30,123,138,160],"steps,":[31],"ThinkTwice":[32,81,98,126],"first":[33],"model":[36,75,148],"solving":[38],"problems,":[40,52],"then":[41],"it":[43],"refining":[45],"its":[46],"own":[47],"solutions":[48,145],"same":[51,55],"using":[53],"binary":[56],"correctness":[57,63],"reward":[58,154],"in":[59,137],"both":[60,84],"phases":[61],"without":[62],"signals":[64],"or":[65],"critique":[66],"annotations.":[67],"Across":[68],"five":[69],"mathematical":[70],"benchmarks":[72],"two":[74],"families":[76],"including":[77],"Qwen3-4B":[78],"Olmo3-7B,":[80],"substantially":[82],"improves":[83],"refinement":[87,108,132],"performance":[88],"over":[89],"competitive":[90],"online":[91],"policy":[92],"optimization":[93],"baselines.":[94],"Specifically,":[95],"Qwen3-4B,":[97],"outperforms":[99],"GRPO":[100],"AIME":[102],"by":[103,110,118],"5":[104],"percentage":[105],"points":[106,112],"before":[107],"11.5":[111],"after":[113],"one":[114],"self-refinement":[115,164],"step,":[116],"measured":[117],"pass@4.":[119],"Analysis":[120],"dynamics":[124],"reveals":[127],"an":[128],"implicit":[129],"rectify-then-fortify":[130],"curriculum:":[131],"predominantly":[133],"corrects":[134],"errors":[135],"early":[136],"naturally":[140],"shifts":[141],"toward":[142],"preserving":[143],"already-correct":[144],"as":[146,165],"improves,":[149],"yielding":[150],"more":[152],"rectified":[153],"signal.":[155],"Our":[156],"work":[157],"establishes":[158],"joint":[159],"principled":[167],"effective":[169],"methodology":[170],"for":[171],"RLVR.":[172]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-04T00:00:00"}
