{"id":"https://openalex.org/W7159640600","doi":"https://doi.org/10.48550/arxiv.2604.28123","title":"Beyond SFT-to-RL: Pre-alignment via Black-Box On-Policy Distillation for Multimodal RL","display_name":"Beyond SFT-to-RL: Pre-alignment via Black-Box On-Policy Distillation for Multimodal RL","publication_year":2026,"publication_date":"2026-04-30","ids":{"openalex":"https://openalex.org/W7159640600","doi":"https://doi.org/10.48550/arxiv.2604.28123"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.28123","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.28123","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.28123","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134994300","display_name":"Sudong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Sudong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134945293","display_name":"Weiquan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Weiquan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134943519","display_name":"Xiaomin Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Xiaomin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088457247","display_name":"Zuhao Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zuhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134949545","display_name":"Hehai Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Hehai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134932806","display_name":"Keming Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Keming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014699953","display_name":"Chaojun Xiao","orcid":"https://orcid.org/0000-0001-6039-0942"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Chaojun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134935055","display_name":"Chen Chen (6544)","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134946641","display_name":"Wenxuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wenxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009738035","display_name":"Beier Zhu","orcid":"https://orcid.org/0000-0002-7900-6979"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Beier","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134942393","display_name":"Yunjian Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yunjian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134983502","display_name":"Chengwei Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Chengwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.3384000062942505,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.3384000062942505,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.15530000627040863,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.10170000046491623,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminator","display_name":"Discriminator","score":0.6366999745368958},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.578000009059906},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.48170000314712524},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.47940000891685486},{"id":"https://openalex.org/keywords/obstacle","display_name":"Obstacle","score":0.47929999232292175},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.4742000102996826},{"id":"https://openalex.org/keywords/prism","display_name":"Prism","score":0.4320000112056732},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.4122999906539917}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6998999714851379},{"id":"https://openalex.org/C2779803651","wikidata":"https://www.wikidata.org/wiki/Q5282088","display_name":"Discriminator","level":3,"score":0.6366999745368958},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.578000009059906},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5436999797821045},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.48170000314712524},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.47940000891685486},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.47929999232292175},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.4742000102996826},{"id":"https://openalex.org/C67666897","wikidata":"https://www.wikidata.org/wiki/Q165896","display_name":"Prism","level":2,"score":0.4320000112056732},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.4122999906539917},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4075999855995178},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3700999915599823},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3617999851703644},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.359499990940094},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.34860000014305115},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.3359000086784363},{"id":"https://openalex.org/C30905978","wikidata":"https://www.wikidata.org/wiki/Q815598","display_name":"Metering mode","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C102634674","wikidata":"https://www.wikidata.org/wiki/Q868473","display_name":"Smoothness","level":2,"score":0.3102000057697296},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3084000051021576},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2567000091075897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.28123","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.28123","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.28123","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.28123","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.6281533241271973,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"standard":[1],"post-training":[2],"recipe":[3],"for":[4,139],"large":[5],"multimodal":[6,48,190],"models":[7],"(LMMs)":[8],"applies":[9],"supervised":[10],"fine-tuning":[11],"(SFT)":[12],"on":[13,86,165,171,204],"curated":[14],"demonstrations":[15,137,153],"followed":[16],"by":[17,75,195],"reinforcement":[18],"learning":[19],"with":[20,110],"verifiable":[21],"rewards":[22],"(RLVR).":[23],"However,":[24],"SFT":[25,82,141],"introduces":[26],"distributional":[27],"drift":[28,58,74],"that":[29,60,71,120,174],"neither":[30],"preserves":[31],"the":[32,39,87,103,122,125,166,201],"model's":[33],"original":[34],"capabilities":[35],"nor":[36],"faithfully":[37],"matches":[38],"supervision":[40,126],"distribution.":[41],"This":[42],"problem":[43],"is":[44],"further":[45],"amplified":[46],"in":[47],"reasoning,":[49],"where":[50],"perception":[51,112],"errors":[52],"and":[53,83,105,113,162,188,197,206,212],"reasoning":[54,114,164],"failures":[55],"follow":[56],"distinct":[57],"patterns":[59],"compound":[61],"during":[62],"subsequent":[63],"RL.":[64],"We":[65],"introduce":[66],"PRISM,":[67],"a":[68,97,106],"three-stage":[69],"pipeline":[70],"mitigates":[72],"this":[73],"inserting":[76],"an":[77],"explicit":[78],"distribution-alignment":[79],"stage":[80],"between":[81,102],"RLVR.":[84],"Building":[85],"principle":[88],"of":[89],"on-policy":[90],"distillation":[91],"(OPD),":[92],"PRISM":[93,175],"casts":[94],"alignment":[95,144],"as":[96],"black-box,":[98],"response-level":[99],"adversarial":[100],"game":[101],"policy":[104,123],"Mixture-of-Experts":[107],"(MoE)":[108],"discriminator":[109],"dedicated":[111],"experts,":[115],"providing":[116],"disentangled":[117],"corrective":[118],"signals":[119],"steer":[121],"toward":[124],"distribution":[127,143],"without":[128],"requiring":[129],"access":[130],"to":[131],"teacher":[132],"logits.":[133],"While":[134],"1.26M":[135],"public":[136],"suffice":[138],"broad":[140],"initialization,":[142],"demands":[145],"higher-fidelity":[146],"supervision;":[147],"we":[148],"therefore":[149],"curate":[150],"113K":[151],"additional":[152],"from":[154],"Gemini":[155],"3":[156],"Flash,":[157],"featuring":[158],"dense":[159],"visual":[160],"grounding":[161],"step-by-step":[163],"hardest":[167],"unsolved":[168],"problems.":[169],"Experiments":[170],"Qwen3-VL":[172],"show":[173],"consistently":[176],"improves":[177],"downstream":[178],"RLVR":[179],"performance":[180],"across":[181],"multiple":[182],"RL":[183],"algorithms":[184],"(GRPO,":[185],"DAPO,":[186],"GSPO)":[187],"diverse":[189],"benchmarks,":[191],"improving":[192],"average":[193],"accuracy":[194],"+4.4":[196],"+6.0":[198],"points":[199],"over":[200],"SFT-to-RLVR":[202],"baseline":[203],"4B":[205],"8B,":[207],"respectively.":[208],"Our":[209],"code,":[210],"data,":[211],"model":[213],"checkpoints":[214],"are":[215],"publicly":[216],"available":[217],"at":[218],"https://github.com/XIAO4579/PRISM.":[219]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-02T00:00:00"}
