{"id":"https://openalex.org/W7162075892","doi":"https://doi.org/10.48550/arxiv.2605.22217","title":"Survive or Collapse: The Asymmetric Roles of Data Gating and Reward Grounding in Self-Play RL","display_name":"Survive or Collapse: The Asymmetric Roles of Data Gating and Reward Grounding in Self-Play RL","publication_year":2026,"publication_date":"2026-05-21","ids":{"openalex":"https://openalex.org/W7162075892","doi":"https://doi.org/10.48550/arxiv.2605.22217"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.22217","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22217","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.22217","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120308982","display_name":"Sophia Xiao Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pu, Sophia Xiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136734124","display_name":"Zhaotian Weng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weng, Zhaotian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136737666","display_name":"Chengzhi Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Chengzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136773942","display_name":"Jayanth Srinivasa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Srinivasa, Jayanth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136773697","display_name":"Gaowen Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Gaowen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136767451","display_name":"William W. B. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, William Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136763504","display_name":"Xin Eric Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xin Eric","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.29739999771118164,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.29739999771118164,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.07840000092983246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.051100000739097595,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.4691999852657318},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.46549999713897705},{"id":"https://openalex.org/keywords/gating","display_name":"Gating","score":0.44119998812675476},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.44119998812675476},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.4097000062465668},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.38510000705718994},{"id":"https://openalex.org/keywords/corollary","display_name":"Corollary","score":0.3610000014305115},{"id":"https://openalex.org/keywords/satisfiability-modulo-theories","display_name":"Satisfiability modulo theories","score":0.34769999980926514}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6748999953269958},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.4691999852657318},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.46549999713897705},{"id":"https://openalex.org/C194544171","wikidata":"https://www.wikidata.org/wiki/Q21105679","display_name":"Gating","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.4097000062465668},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.38510000705718994},{"id":"https://openalex.org/C2780012671","wikidata":"https://www.wikidata.org/wiki/Q1343870","display_name":"Corollary","level":2,"score":0.3610000014305115},{"id":"https://openalex.org/C164155591","wikidata":"https://www.wikidata.org/wiki/Q2067766","display_name":"Satisfiability modulo theories","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.34459999203681946},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.33660000562667847},{"id":"https://openalex.org/C2778770139","wikidata":"https://www.wikidata.org/wiki/Q1966904","display_name":"Solver","level":2,"score":0.3296000063419342},{"id":"https://openalex.org/C2778000800","wikidata":"https://www.wikidata.org/wiki/Q830043","display_name":"Handshake","level":3,"score":0.29840001463890076},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29089999198913574},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.26409998536109924},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C131584629","wikidata":"https://www.wikidata.org/wiki/Q4308705","display_name":"Coupling (piping)","level":2,"score":0.2565000057220459}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.22217","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22217","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.22217","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22217","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Self-play":[0],"reinforcement":[1],"learning":[2],"trains":[3],"language":[4],"models":[5],"on":[6,76,83,176,229],"their":[7],"own":[8],"generated":[9],"tasks,":[10],"co-evolving":[11],"a":[12,41,56,84,89,123,146,155,170,185,194,201],"proposer":[13,156],"and":[14,27,32,68,88,99],"solver":[15],"without":[16],"human":[17],"labels.":[18],"Recent":[19],"systems":[20],"report":[21],"strong":[22],"reasoning":[23],"gains,":[24],"but":[25],"collapse":[26,161],"instability":[28],"are":[29,107],"widely":[30],"observed":[31],"poorly":[33],"understood.":[34],"The":[35],"dominant":[36],"response":[37],"treats":[38],"this":[39],"as":[40],"reward-design":[42],"problem.":[43],"We":[44],"argue":[45],"instead":[46],"that":[47,59,72,93,179],"self-play":[48,230],"stability":[49,115],"is":[50,112,136,141,217,225],"governed":[51],"by":[52,173],"two":[53,105],"distinct":[54],"levers:":[55],"data-level":[57],"gate":[58,111,140,192],"decides":[60],"which":[61],"proposer-generated":[62],"tasks":[63,77,178],"enter":[64],"the":[65,69,74,104,139,151,181,190,226],"training":[66,175],"pool,":[67],"reward":[70,118,125,134,223],"signal":[71],"updates":[73],"policy":[75],"already":[78],"admitted.":[79],"Through":[80],"controlled":[81],"experiments":[82],"Python":[85],"output-prediction":[86],"task":[87,92],"deterministic-DSL":[90],"twin":[91],"strips":[94],"pretraining":[95],"priors,":[96],"output":[97],"ambiguity,":[98],"executor":[100],"noise,":[101],"we":[102,120,149],"find":[103],"levers":[106],"asymmetric.":[108],"A":[109],"strict":[110],"sufficient":[113,137],"for":[114],"under":[116],"every":[117],"variant":[119,135],"test,":[121],"including":[122],"self-consistency":[124,171],"with":[126,157,169,193],"no":[127,133],"access":[128,159],"to":[129,184],"ground":[130],"truth;":[131],"while":[132,211],"once":[138],"removed.":[142],"This":[143],"asymmetry":[144],"exposes":[145],"counter-intuitive":[147],"coupling":[148],"call":[150],"Grounded":[152],"Proposer":[153],"Paradox:":[154],"ground-truth":[158],"accelerates":[160],"faster":[162],"than":[163],"an":[164],"ungrounded":[165],"one":[166],"when":[167],"paired":[168],"solver,":[172],"concentrating":[174],"clean":[177],"form":[180],"fastest":[182],"path":[183],"spurious":[186],"self-consistent":[187],"attractor.":[188],"Replacing":[189],"binary":[191],"continuous":[195],"strictness":[196],"parameter":[197],"$\\varepsilon$":[198,216],"further":[199],"reveals":[200],"two-stage":[202],"phase":[203],"transition:":[204],"training-side":[205],"metrics":[206],"decouple":[207],"at":[208],"low":[209],"$\\varepsilon$,":[210],"validation":[212],"accuracy":[213],"holds":[214],"until":[215],"much":[218],"higher.":[219],"Data-level":[220],"gating,":[221],"not":[222],"calibration,":[224],"binding":[227],"constraint":[228],"stability.":[231]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-23T00:00:00"}
