{"id":"https://openalex.org/W7154481905","doi":"https://doi.org/10.48550/arxiv.2604.13010","title":"Lightning OPD: Efficient Post-Training for Large Reasoning Models with Offline On-Policy Distillation","display_name":"Lightning OPD: Efficient Post-Training for Large Reasoning Models with Offline On-Policy Distillation","publication_year":2026,"publication_date":"2026-04-14","ids":{"openalex":"https://openalex.org/W7154481905","doi":"https://doi.org/10.48550/arxiv.2604.13010"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.13010","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13010","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.13010","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133672726","display_name":"Yecheng Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wu, Yecheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133717150","display_name":"Song Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Song","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133637814","display_name":"Hai Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Hai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5133672726"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19529999792575836,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19529999792575836,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1891999989748001,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.09650000184774399,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.6223999857902527},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5921000242233276},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.5159000158309937},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4846000075340271},{"id":"https://openalex.org/keywords/lightning","display_name":"Lightning (connector)","score":0.4821000099182129},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4560999870300293},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4397999942302704},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4383000135421753}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6567999720573425},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.6223999857902527},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5921000242233276},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.5159000158309937},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4846000075340271},{"id":"https://openalex.org/C69398868","wikidata":"https://www.wikidata.org/wiki/Q129052","display_name":"Lightning (connector)","level":3,"score":0.4821000099182129},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4560999870300293},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4397999942302704},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4383000135421753},{"id":"https://openalex.org/C34388435","wikidata":"https://www.wikidata.org/wiki/Q2267362","display_name":"Bounded function","level":2,"score":0.4372999966144562},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.3968000113964081},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.35339999198913574},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28209999203681946},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C2780102126","wikidata":"https://www.wikidata.org/wiki/Q10928179","display_name":"Online and offline","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25760000944137573},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.13010","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13010","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.13010","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13010","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.5144059062004089,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"On-policy":[0],"distillation":[1,112],"(OPD)":[2],"is":[3,229],"an":[4,109,148,182],"effective":[5],"post-training":[6],"paradigm":[7],"for":[8,79,95,122,221],"large":[9],"language":[10],"models":[11],"but":[12],"requires":[13],"a":[14,64,89,123,213],"live":[15,124],"teacher":[16,35,70,76,116,125,132],"server":[17,126],"throughout":[18],"training,":[19],"resulting":[20],"in":[21,193],"substantial":[22],"infrastructure":[23],"overhead.":[24],"We":[25,46,128],"investigate":[26],"whether":[27],"OPD":[28,135,167,173,187,199],"can":[29],"be":[30,77],"performed":[31],"offline":[32,97,110],"by":[33],"precomputing":[34],"log-probabilities":[36],"once":[37],"over":[38],"SFT":[39],"rollouts":[40],"and":[41,58,83,98,118,147,161],"reusing":[42],"them":[43],"during":[44],"training.":[45],"find":[47],"that":[48,73,92,114,152,165],"naively":[49],"doing":[50],"so":[51],"fails":[52],"to":[53,63,171,202,207],"reliably":[54],"match":[55],"standard":[56,141,172],"OPD,":[57,108,142],"trace":[59],"the":[60,74,120,137,219],"root":[61],"cause":[62],"previously":[65],"overlooked":[66],"condition":[67,87],"we":[68,105],"term":[69],"consistency,":[71,133],"requiring":[72],"same":[75,138],"used":[78],"both":[80,96],"supervised":[81],"fine-tuning":[82],"OPD.":[84,100],"Violating":[85],"this":[86,103],"introduces":[88],"gradient":[90,145],"bias":[91],"degrades":[93],"performance":[94,170],"online":[99],"Building":[101],"on":[102,158,190,209,212,224],"insight,":[104],"propose":[106],"Lightning":[107,134,166,186,198],"on-policy":[111],"framework":[113],"enforces":[115],"consistency":[117],"eliminates":[119],"need":[121],"entirely.":[127],"prove":[129],"that,":[130],"under":[131],"shares":[136],"optimum":[139],"as":[140],"with":[143],"bounded":[144],"discrepancy":[146],"implicit":[149],"regularization":[150],"effect":[151],"helps":[153],"prevent":[154],"policy":[155],"drift.":[156],"Experiments":[157],"math":[159],"reasoning":[160],"code":[162,228],"generation":[163],"show":[164],"achieves":[168],"comparable":[169],"while":[174],"delivering":[175],"4.0x":[176],"higher":[177],"training":[178,205],"efficiency.":[179],"Starting":[180],"from":[181],"SFT-initialized":[183],"Qwen3-8B-Base":[184],"model,":[185],"reaches":[188],"69.9%":[189],"AIME":[191,210],"2024":[192,211],"just":[194],"30":[195],"GPU":[196],"hours.":[197],"further":[200],"scales":[201],"MoE":[203],"architectures,":[204],"Qwen3-30B-A3B":[206],"71.0%":[208],"single":[214],"8xH100":[215],"node,":[216],"substantially":[217],"lowering":[218],"barrier":[220],"academic":[222],"research":[223],"LLM":[225],"post-training.":[226],"Our":[227],"released":[230],"at":[231],"https://github.com/jet-ai-projects/Lightning-OPD.":[232]},"counts_by_year":[],"updated_date":"2026-05-12T06:07:45.972803","created_date":"2026-04-16T00:00:00"}
