{"id":"https://openalex.org/W7152432596","doi":"https://doi.org/10.48550/arxiv.2604.06916","title":"FP4 Explore, BF16 Train: Diffusion Reinforcement Learning via Efficient Rollout Scaling","display_name":"FP4 Explore, BF16 Train: Diffusion Reinforcement Learning via Efficient Rollout Scaling","publication_year":2026,"publication_date":"2026-04-08","ids":{"openalex":"https://openalex.org/W7152432596","doi":"https://doi.org/10.48550/arxiv.2604.06916"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06916","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06916","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06916","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133287694","display_name":"Yitong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yitong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133312760","display_name":"Junsong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Junsong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133298159","display_name":"Shuchen Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Shuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133283510","display_name":"Pengcuo Zeren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeren, Pengcuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133297012","display_name":"Siyuan Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Siyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133303651","display_name":"Dinghao Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Dinghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133285212","display_name":"Yangyang Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Yangyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080338808","display_name":"Junjie Bai","orcid":"https://orcid.org/0009-0002-4280-3648"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Junjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133243235","display_name":"Ping Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Ping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133307728","display_name":"Song Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Song","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133302653","display_name":"Enze Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Enze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.17499999701976776,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.17499999701976776,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.11800000071525574,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.09929999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6342999935150146},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.6086000204086304},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5771999955177307},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.4982999861240387},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4790000021457672},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.3718000054359436}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7911999821662903},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6342999935150146},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.6086000204086304},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5771999955177307},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.4982999861240387},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4790000021457672},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.3718000054359436},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3630000054836273},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3246000111103058},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.299699991941452},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.29010000824928284},{"id":"https://openalex.org/C2778496695","wikidata":"https://www.wikidata.org/wiki/Q254128","display_name":"Dilemma","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.26460000872612}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06916","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06916","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06916","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06916","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement-Learning-based":[0],"post-training":[1],"has":[2],"recently":[3],"emerged":[4],"as":[5],"a":[6,47,93,108,114,229],"promising":[7],"paradigm":[8],"for":[9,32,169],"aligning":[10],"text-to-image":[11],"diffusion":[12,42],"models":[13,43],"with":[14,149],"human":[15],"preferences.":[16],"In":[17],"recent":[18],"studies,":[19],"increasing":[20],"the":[21,57,129,143,150,162,178,188,222,232],"rollout":[22,147,163,226],"group":[23],"size":[24],"yields":[25],"pronounced":[26],"performance":[27,77,209],"improvements,":[28],"indicating":[29],"substantial":[30],"room":[31],"further":[33],"alignment":[34,208],"gains.":[35],"However,":[36],"scaling":[37,148,227],"rollouts":[38,105],"on":[39,132],"large-scale":[40],"foundational":[41],"(e.g.,":[44],"FLUX.1-12B)":[45],"imposes":[46],"heavy":[48],"computational":[49],"burden.":[50],"To":[51,79],"alleviate":[52],"this":[53,81],"bottleneck,":[54],"we":[55,67,88,101,119],"explore":[56],"integration":[58],"of":[59,76,146,154,181,224,231],"FP4":[60,193],"quantization":[61],"into":[62],"Diffusion":[63],"RL":[64],"rollouts.":[65],"Yet,":[66],"identify":[68],"that":[69,174,203],"naive":[70],"quantized":[71],"pipelines":[72],"inherently":[73],"introduce":[74],"risks":[75],"degradation.":[78],"overcome":[80],"dilemma":[82],"between":[83],"efficiency":[84],"and":[85,112,127,200],"training":[86,179,215],"integrity,":[87],"propose":[89],"Sol-RL":[90,141],"(Speed-of-light":[91],"RL),":[92],"novel":[94],"FP4-empowered":[95],"Two-stage":[96],"Reinforcement":[97],"Learning":[98],"framework.":[99],"First,":[100],"utilize":[102],"high-throughput":[103],"NVFP4":[104],"to":[106,219],"generate":[107],"massive":[109,225],"candidate":[110,136],"pool":[111],"extract":[113],"highly":[115],"contrastive":[116],"subset.":[117],"Second,":[118],"regenerate":[120],"these":[121],"selected":[122],"samples":[123,168],"in":[124],"BF16":[125,182],"precision":[126,183],"optimize":[128],"policy":[130,139],"exclusively":[131],"them.":[133],"By":[134],"decoupling":[135],"exploration":[137],"from":[138],"optimization,":[140],"integrates":[142],"algorithmic":[144],"mechanisms":[145],"system-level":[151],"throughput":[152,189],"gains":[153,190],"NVFP4.":[155],"This":[156],"synergistic":[157],"algorithm-hardware":[158],"design":[159],"effectively":[160],"accelerates":[161],"phase":[164],"while":[165,185,213],"reserving":[166],"high-fidelity":[167],"optimization.":[170],"We":[171],"empirically":[172],"demonstrate":[173],"our":[175,204],"framework":[176],"maintains":[177],"integrity":[180],"pipeline":[184],"fully":[186],"exploiting":[187],"enabled":[191],"by":[192,217],"arithmetic.":[194],"Extensive":[195],"experiments":[196],"across":[197,210],"SANA,":[198],"FLUX.1,":[199],"SD3.5-L":[201],"substantiate":[202],"approach":[205],"delivers":[206],"superior":[207],"multiple":[211],"metrics":[212],"accelerating":[214],"convergence":[216],"up":[218],"$4.64\\times$,":[220],"unlocking":[221],"power":[223],"at":[228],"fraction":[230],"cost.":[233]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-10T00:00:00"}
