{"id":"https://openalex.org/W7135174508","doi":"https://doi.org/10.48550/arxiv.2603.11137","title":"Scaling Reasoning Efficiently via Relaxed On-Policy Distillation","display_name":"Scaling Reasoning Efficiently via Relaxed On-Policy Distillation","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135174508","doi":"https://doi.org/10.48550/arxiv.2603.11137"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.11137","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11137","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.11137","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028394311","display_name":"Jongwoo Ko","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ko, Jongwoo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082886672","display_name":"Sara Abdali","orcid":"https://orcid.org/0000-0002-4603-7381"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdali, Sara","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100387388","display_name":"Young J. Kim","orcid":"https://orcid.org/0000-0003-2159-4832"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Young Jin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128947107","display_name":"Tianyi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Tianyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128965754","display_name":"Pashmina Cameron","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cameron, Pashmina","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5028394311"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.34150001406669617,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.34150001406669617,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.16449999809265137,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.04650000110268593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.6746000051498413},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.631600022315979},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5906999707221985},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.5378999710083008},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.41620001196861267},{"id":"https://openalex.org/keywords/imitation","display_name":"Imitation","score":0.41019999980926514},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4036000072956085}],"concepts":[{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.6746000051498413},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6419000029563904},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.631600022315979},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5906999707221985},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.5378999710083008},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43309998512268066},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.41620001196861267},{"id":"https://openalex.org/C126388530","wikidata":"https://www.wikidata.org/wiki/Q1131737","display_name":"Imitation","level":2,"score":0.41019999980926514},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4036000072956085},{"id":"https://openalex.org/C2778445095","wikidata":"https://www.wikidata.org/wiki/Q18354077","display_name":"Sample complexity","level":2,"score":0.39469999074935913},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3946000039577484},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.3264999985694885},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31439998745918274},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.31279999017715454},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2556999921798706}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.11137","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11137","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.11137","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11137","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"On-policy":[0],"distillation":[1,23],"is":[2],"pivotal":[3],"for":[4],"transferring":[5],"reasoning":[6,118,143],"capabilities":[7],"to":[8,14,136],"capacity-constrained":[9],"models,":[10],"yet":[11],"remains":[12],"prone":[13],"instability":[15],"and":[16,29,74,89,106,115,131],"negative":[17],"transfer.":[18],"We":[19],"show":[20],"that":[21,58],"on-policy":[22,69],"can":[24],"be":[25],"interpreted,":[26],"both":[27],"theoretically":[28],"empirically,":[30],"as":[31,43],"a":[32,44,56,90,133,138,145],"form":[33],"of":[34,67],"policy":[35],"optimization,":[36],"where":[37],"the":[38,63,79],"teacher-student":[39],"log-likelihood":[40],"ratio":[41],"acts":[42],"token":[45],"reward.":[46],"From":[47],"this":[48],"insight,":[49],"we":[50],"introduce":[51],"REOPOLD":[52,72,96,121],"(Relaxed":[53],"On-Policy":[54],"Distillation)":[55],"framework":[57],"stabilizes":[59],"optimization":[60],"by":[61],"relaxing":[62],"strict":[64],"imitation":[65],"constraints":[66],"standard":[68],"distillation.":[70],"Specifically,":[71,120],"temperately":[73],"selectively":[75],"leverages":[76],"rewards":[77],"from":[78],"teacher":[80,140],"through":[81],"mixture-based":[82],"reward":[83],"clipping,":[84],"entropy-based":[85],"token-level":[86],"dynamic":[87],"sampling,":[88],"unified":[91],"exploration-to-refinement":[92],"training":[93,105],"strategy.":[94],"Empirically,":[95],"surpasses":[97],"its":[98],"baselines":[99],"with":[100,144],"superior":[101],"sample":[102,129],"efficiency":[103,130],"during":[104],"enhanced":[107],"test-time":[108],"scaling":[109],"at":[110],"inference,":[111],"across":[112],"mathematical,":[113],"visual,":[114],"agentic":[116],"tool-use":[117],"tasks.":[119],"outperforms":[122],"recent":[123],"RL":[124],"approaches":[125],"achieving":[126],"6.7~12x":[127],"greater":[128],"enables":[132],"7B":[134],"student":[135],"match":[137],"32B":[139],"in":[141],"visual":[142],"~3.32x":[146],"inference":[147],"speedup.":[148]},"counts_by_year":[],"updated_date":"2026-03-14T06:46:50.379900","created_date":"2026-03-14T00:00:00"}
