{"id":"https://openalex.org/W7130361034","doi":"https://doi.org/10.48550/arxiv.2602.15620","title":"STAPO: Stabilizing Reinforcement Learning for LLMs by Silencing Rare Spurious Tokens","display_name":"STAPO: Stabilizing Reinforcement Learning for LLMs by Silencing Rare Spurious Tokens","publication_year":2026,"publication_date":"2026-02-17","ids":{"openalex":"https://openalex.org/W7130361034","doi":"https://doi.org/10.48550/arxiv.2602.15620"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.15620","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.15620","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.15620","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126297608","display_name":"Shiqi Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Shiqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126324925","display_name":"Zeyu He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Zeyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029301375","display_name":"Guojian Zhan","orcid":"https://orcid.org/0000-0002-1246-4860"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhan, Guojian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126303381","display_name":"Letian Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Letian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005752974","display_name":"Zhilong Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zhilong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126318677","display_name":"Jiang Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126335955","display_name":"Yinuo Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yinuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126298999","display_name":"Yang Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016509766","display_name":"Kehua Sheng","orcid":"https://orcid.org/0009-0008-3370-7711"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheng, Kehua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126308886","display_name":"Bo Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Bo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126327439","display_name":"Keqiang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Keqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067909017","display_name":"Jingliang Duan","orcid":"https://orcid.org/0000-0002-3697-1576"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duan, Jingliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126313021","display_name":"Shengbo Eben Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shengbo Eben","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5126297608"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2531999945640564,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2531999945640564,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.1386999934911728,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.10920000076293945,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.940500020980835},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7236999869346619},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6549999713897705},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.542900025844574},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.45750001072883606},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.4560000002384186},{"id":"https://openalex.org/keywords/reinforcement","display_name":"Reinforcement","score":0.3725999891757965}],"concepts":[{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.940500020980835},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7236999869346619},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6549999713897705},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6207000017166138},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.542900025844574},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.45750001072883606},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.4560000002384186},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4196000099182129},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.3725999891757965},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.34619998931884766},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C9679016","wikidata":"https://www.wikidata.org/wiki/Q1417473","display_name":"Principle of maximum entropy","level":2,"score":0.30169999599456787},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27730000019073486},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.15620","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.15620","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.15620","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.15620","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"Learning":[1],"(RL)":[2],"has":[3],"significantly":[4],"improved":[5],"large":[6],"language":[7],"model":[8,170],"reasoning,":[9],"but":[10,103],"existing":[11],"RL":[12,57],"fine-tuning":[13],"methods":[14],"rely":[15],"heavily":[16],"on":[17],"heuristic":[18],"techniques":[19],"such":[20,90],"as":[21],"entropy":[22,189],"regularization":[23],"and":[24,43,64,140,143,167,181,191,201,208],"reweighting":[25],"to":[26,39,99,110,127],"maintain":[27],"stability.":[28],"In":[29],"practice,":[30],"they":[31,96],"often":[32],"suffer":[33],"from":[34],"late-stage":[35],"performance":[36,195],"collapse,":[37],"leading":[38,109],"degraded":[40],"reasoning":[41,101,175],"quality":[42],"unstable":[44],"training.":[45],"Our":[46],"analysis":[47],"shows":[48],"that":[49,70],"the":[50,100,105],"magnitude":[51],"of":[52,80,197],"token-wise":[53],"policy":[54,66],"gradients":[55],"in":[56,93],"is":[58],"negatively":[59],"correlated":[60],"with":[61,135],"token":[62],"probability":[63],"local":[65],"entropy.":[67],"We":[68],"find":[69],"training":[71],"instability":[72],"can":[73],"be":[74],"caused":[75],"by":[76],"a":[77,155],"tiny":[78],"fraction":[79],"tokens,":[81],"approximately":[82],"0.01%,":[83],"which":[84,164],"we":[85,119,158],"term":[86],"spurious":[87,124,130],"tokens.":[88],"When":[89],"tokens":[91,131],"appear":[92],"correct":[94],"responses,":[95],"contribute":[97],"little":[98],"outcome":[102],"inherit":[104],"full":[106],"sequence-level":[107],"reward,":[108],"abnormally":[111],"amplified":[112],"gradient":[113,147],"updates.":[114],"To":[115],"mitigate":[116],"this":[117,152],"instability,":[118],"design":[120],"an":[121,193],"S2T":[122],"(silencing":[123],"tokens)":[125],"mechanism":[126,153],"efficiently":[128],"identify":[129],"through":[132],"characteristic":[133],"signals":[134],"low":[136,138],"probability,":[137],"entropy,":[139],"positive":[141],"advantage,":[142],"then":[144],"suppress":[145],"their":[146],"perturbations":[148],"during":[149],"optimization.":[150],"Incorporating":[151],"into":[154],"group-based":[156],"objective,":[157],"propose":[159],"Spurious-Token-Aware":[160],"Policy":[161],"Optimization":[162],"(STAPO),":[163],"promotes":[165],"stable":[166],"effective":[168],"large-scale":[169],"refinement.":[171],"Across":[172],"six":[173],"mathematical":[174],"benchmarks":[176],"using":[177],"Qwen":[178],"1.7B,":[179],"8B,":[180],"14B":[182],"base":[183],"models,":[184],"STAPO":[185],"consistently":[186],"demonstrates":[187],"superior":[188],"stability":[190],"achieves":[192],"average":[194],"improvement":[196],"7.13%":[198],"($\u03c1_{\\mathrm{T}}$=1.0,":[199],"top-p=1.0)":[200],"3.69%":[202],"($\u03c1_{\\mathrm{T}}$=0.7,":[203],"top-p=0.9)":[204],"over":[205],"GRPO,":[206],"20-Entropy,":[207],"JustRL.":[209]},"counts_by_year":[],"updated_date":"2026-02-25T06:17:34.324206","created_date":"2026-02-19T00:00:00"}
