{"id":"https://openalex.org/W7148407214","doi":"https://doi.org/10.48550/arxiv.2604.00790","title":"RefineRL: Advancing Competitive Programming with Self-Refinement Reinforcement Learning","display_name":"RefineRL: Advancing Competitive Programming with Self-Refinement Reinforcement Learning","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7148407214","doi":"https://doi.org/10.48550/arxiv.2604.00790"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00790","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00790","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00790","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124944012","display_name":"Shaopeng Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Shaopeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132822587","display_name":"Xingxing Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xingxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132824881","display_name":"Li Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132799903","display_name":"Di Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Di","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132815845","display_name":"Furu Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Furu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.18000000715255737,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.18000000715255737,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.11190000176429749,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.10869999974966049,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7307999730110168},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6001999974250793},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.4544999897480011},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4251999855041504},{"id":"https://openalex.org/keywords/competitive-advantage","display_name":"Competitive advantage","score":0.37389999628067017},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.34790000319480896},{"id":"https://openalex.org/keywords/skepticism","display_name":"Skepticism","score":0.3264999985694885}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7307999730110168},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6535999774932861},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6001999974250793},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.4544999897480011},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.447299987077713},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4251999855041504},{"id":"https://openalex.org/C58546491","wikidata":"https://www.wikidata.org/wiki/Q1150207","display_name":"Competitive advantage","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.34790000319480896},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3382999897003174},{"id":"https://openalex.org/C18296254","wikidata":"https://www.wikidata.org/wiki/Q1395219","display_name":"Skepticism","level":2,"score":0.3264999985694885},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C143587482","wikidata":"https://www.wikidata.org/wiki/Q1543216","display_name":"Iterative and incremental development","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.2768000066280365},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2646999955177307},{"id":"https://openalex.org/C117619785","wikidata":"https://www.wikidata.org/wiki/Q6094414","display_name":"Iterative learning control","level":3,"score":0.2574000060558319},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00790","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00790","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00790","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00790","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"large":[1],"language":[2],"models":[3,143,154],"(LLMs)":[4],"have":[5],"demonstrated":[6],"strong":[7],"performance":[8,160],"on":[9,22,125],"complex":[10],"reasoning":[11],"tasks":[12],"such":[13],"as":[14],"competitive":[15],"programming":[16],"(CP),":[17],"existing":[18],"methods":[19],"predominantly":[20],"focus":[21],"single-attempt":[23,159],"settings,":[24],"overlooking":[25],"their":[26,120],"capacity":[27],"for":[28,48,172,179],"iterative":[29,60],"refinement.":[30],"In":[31],"this":[32],"paper,":[33],"we":[34],"present":[35],"RefineRL,":[36],"a":[37,83],"novel":[38],"approach":[39,157],"designed":[40],"to":[41,68,106,109],"unleash":[42],"the":[43,146,158],"self-refinement":[44,61,94,168],"capabilities":[45],"of":[46,76,161],"LLMs":[47,108],"CP":[49,77],"problem":[50],"solving.":[51],"RefineRL":[52],"introduces":[53],"two":[54],"key":[55],"innovations:":[56],"(1)":[57],"Skeptical-Agent,":[58],"an":[59],"agent":[62,80],"equipped":[63],"with":[64,111,119,145,176],"local":[65],"execution":[66],"tools":[67],"validate":[69],"generated":[70],"solutions":[71],"against":[72],"public":[73],"test":[74],"cases":[75],"problems.":[78],"This":[79],"always":[81],"maintains":[82],"skeptical":[84],"attitude":[85],"towards":[86],"its":[87],"own":[88],"outputs":[89],"and":[90,127],"thereby":[91],"enforces":[92],"rigorous":[93],"even":[95],"when":[96],"validation":[97],"suggests":[98],"correctness.":[99],"(2)":[100],"A":[101],"reinforcement":[102],"learning":[103],"(RL)":[104],"solution":[105],"incentivize":[107],"self-refine":[110],"only":[112,149],"standard":[113],"RLVR":[114],"data":[115],"(i.e.,":[116],"problems":[117],"paired":[118],"verifiable":[121],"answers).":[122],"Extensive":[123],"experiments":[124],"Qwen3-4B":[126],"Qwen3-4B-2507":[128],"demonstrate":[129],"that":[130,167],"our":[131,137],"method":[132],"yields":[133],"substantial":[134],"gains:":[135],"after":[136],"RL":[138],"training,":[139],"these":[140],"compact":[141],"4B":[142],"integrated":[144],"Skeptical-Agent":[147],"not":[148],"outperform":[150],"much":[151],"larger":[152],"32B":[153],"but":[155],"also":[156],"235B":[162],"models.":[163],"These":[164],"findings":[165],"suggest":[166],"holds":[169],"considerable":[170],"promise":[171],"scaling":[173],"LLM":[174],"reasoning,":[175],"significant":[177],"potential":[178],"further":[180],"advancement.":[181]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-03T00:00:00"}
