{"id":"https://openalex.org/W7148703720","doi":"https://doi.org/10.48550/arxiv.2604.02288","title":"Unifying Group-Relative and Self-Distillation Policy Optimization via Sample Routing","display_name":"Unifying Group-Relative and Self-Distillation Policy Optimization via Sample Routing","publication_year":2026,"publication_date":"2026-04-02","ids":{"openalex":"https://openalex.org/W7148703720","doi":"https://doi.org/10.48550/arxiv.2604.02288"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.02288","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02288","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.02288","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5007112222","display_name":"Gengsheng S. Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Gengsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132871422","display_name":"Tianyu Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Tianyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132846849","display_name":"Junfeng Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Junfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132902171","display_name":"Mingyang Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Mingyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132853626","display_name":"Mao Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Mao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132876941","display_name":"Haiyun Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Haiyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132894624","display_name":"Dan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Dan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132870020","display_name":"Jinqiao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jinqiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132844178","display_name":"Tat-Seng Chua","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chua, Tat-Seng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5007112222"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.1559000015258789,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.1559000015258789,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12240000069141388,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.10429999977350235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7968999743461609},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.6298999786376953},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.5982999801635742},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.5616000294685364},{"id":"https://openalex.org/keywords/trace","display_name":"TRACE (psycholinguistics)","score":0.5285999774932861},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.5163000226020813},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5080999732017517},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.4449999928474426}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7968999743461609},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6819999814033508},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.6298999786376953},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.5982999801635742},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.5616000294685364},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.5285999774932861},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.5163000226020813},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5080999732017517},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.44589999318122864},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C111696304","wikidata":"https://www.wikidata.org/wiki/Q2303697","display_name":"Sorting","level":2,"score":0.4047999978065491},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3662000000476837},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.35429999232292175},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.30079999566078186},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26080000400543213},{"id":"https://openalex.org/C2779795794","wikidata":"https://www.wikidata.org/wiki/Q7315343","display_name":"Reset (finance)","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.02288","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02288","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.02288","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02288","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1],"with":[2],"verifiable":[3],"rewards":[4],"(RLVR)":[5],"has":[6],"become":[7],"a":[8,101],"standard":[9],"paradigm":[10],"for":[11],"post-training":[12],"large":[13],"language":[14],"models.":[15],"While":[16],"Group":[17],"Relative":[18],"Policy":[19,44,98],"Optimization":[20,45,99],"(GRPO)":[21],"is":[22],"widely":[23],"adopted,":[24],"its":[25],"coarse":[26],"credit":[27],"assignment":[28],"uniformly":[29],"penalizes":[30],"failed":[31,114],"rollouts,":[32],"lacking":[33],"the":[34,85,150,157,165,172],"token-level":[35],"focus":[36],"needed":[37],"to":[38,73,109,116,129,198],"efficiently":[39],"address":[40],"specific":[41],"deviations.":[42],"Self-Distillation":[43],"(SDPO)":[46],"addresses":[47],"this":[48,70],"by":[49,177,196],"providing":[50],"denser,":[51],"more":[52],"targeted":[53,118],"logit-level":[54,119],"supervision":[55],"that":[56,105],"facilitates":[57],"rapid":[58,151],"early":[59,152],"improvement,":[60],"yet":[61],"it":[62],"frequently":[63],"collapses":[64],"during":[65],"prolonged":[66],"training.":[67],"We":[68],"trace":[69],"late-stage":[71],"instability":[72],"two":[74,144],"intrinsic":[75],"flaws:":[76],"self-distillation":[77],"on":[78,175],"already-correct":[79],"samples":[80,108,115],"introduces":[81],"optimization":[82],"ambiguity,":[83],"and":[84,113,143,156,181,191],"self-teacher's":[86],"signal":[87],"reliability":[88],"progressively":[89],"degrades.":[90],"To":[91],"resolve":[92],"these":[93],"issues,":[94],"we":[95],"propose":[96],"Sample-Routed":[97],"(SRPO),":[100],"unified":[102],"on-policy":[103],"framework":[104],"routes":[106],"correct":[107],"GRPO's":[110],"reward-aligned":[111],"reinforcement":[112],"SDPO's":[117],"correction.":[120],"SRPO":[121,147],"further":[122],"incorporates":[123],"an":[124],"entropy-aware":[125],"dynamic":[126],"weighting":[127],"mechanism":[128],"suppress":[130],"high-entropy,":[131],"unreliable":[132],"distillation":[133],"targets":[134],"while":[135,185],"emphasizing":[136],"confident":[137],"ones.":[138],"Evaluated":[139],"across":[140],"five":[141],"benchmarks":[142],"model":[145],"scales,":[146],"achieves":[148],"both":[149,169],"improvement":[153],"of":[154,160,168],"SDPO":[155],"long-horizon":[158],"stability":[159],"GRPO.":[161],"It":[162],"consistently":[163],"surpasses":[164],"peak":[166],"performance":[167],"baselines,":[170],"raising":[171],"five-benchmark":[173],"average":[174],"Qwen3-8B":[176],"3.4%":[178],"over":[179,183],"GRPO":[180],"6.3%":[182],"SDPO,":[184],"simultaneously":[186],"yielding":[187],"moderate":[188],"response":[189],"lengths":[190],"lowering":[192],"per-step":[193],"compute":[194],"cost":[195],"up":[197],"17.2%.":[199]},"counts_by_year":[],"updated_date":"2026-04-04T06:15:33.020886","created_date":"2026-04-04T00:00:00"}
