{"id":"https://openalex.org/W7160487445","doi":"https://doi.org/10.48550/arxiv.2605.04960","title":"EP-GRPO: Entropy-Progress Aligned Group Relative Policy Optimization with Implicit Process Guidance","display_name":"EP-GRPO: Entropy-Progress Aligned Group Relative Policy Optimization with Implicit Process Guidance","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160487445","doi":"https://doi.org/10.48550/arxiv.2605.04960"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.04960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.04960","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135570212","display_name":"Song Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Song","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135566523","display_name":"Li Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071076695","display_name":"Wenwen Zhao","orcid":"https://orcid.org/0000-0001-6019-6680"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Wenwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5058231194","display_name":"Zhisheng Yang","orcid":"https://orcid.org/0000-0003-2458-5114"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhisheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.6963000297546387,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.6963000297546387,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.05270000174641609,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.039000000804662704,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6017000079154968},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.5934000015258789},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.5809000134468079},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5264999866485596},{"id":"https://openalex.org/keywords/kullback\u2013leibler-divergence","display_name":"Kullback\u2013Leibler divergence","score":0.47999998927116394},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.46470001339912415},{"id":"https://openalex.org/keywords/outcome","display_name":"Outcome (game theory)","score":0.4032999873161316},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.3804999887943268},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.3732999861240387}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6858999729156494},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6017000079154968},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.5934000015258789},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.5809000134468079},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5264999866485596},{"id":"https://openalex.org/C171752962","wikidata":"https://www.wikidata.org/wiki/Q255166","display_name":"Kullback\u2013Leibler divergence","level":2,"score":0.47999998927116394},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.46470001339912415},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.4032999873161316},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.3804999887943268},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.3732999861240387},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.35749998688697815},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34929999709129333},{"id":"https://openalex.org/C28901747","wikidata":"https://www.wikidata.org/wiki/Q177571","display_name":"Decision theory","level":2,"score":0.3483000099658966},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.3395000100135803},{"id":"https://openalex.org/C2984634286","wikidata":"https://www.wikidata.org/wiki/Q1331926","display_name":"Decision process","level":2,"score":0.3366999924182892},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2953999936580658},{"id":"https://openalex.org/C9679016","wikidata":"https://www.wikidata.org/wiki/Q1417473","display_name":"Principle of maximum entropy","level":2,"score":0.29249998927116394},{"id":"https://openalex.org/C2779136372","wikidata":"https://www.wikidata.org/wiki/Q10283002","display_name":"Information flow","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C106189395","wikidata":"https://www.wikidata.org/wiki/Q176789","display_name":"Markov decision process","level":3,"score":0.2840000092983246},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C136979486","wikidata":"https://www.wikidata.org/wiki/Q773483","display_name":"Existential quantification","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C52622258","wikidata":"https://www.wikidata.org/wiki/Q131222","display_name":"Information theory","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C73684929","wikidata":"https://www.wikidata.org/wiki/Q598870","display_name":"Lagrange multiplier","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C150325174","wikidata":"https://www.wikidata.org/wiki/Q4335500","display_name":"Optimal decision","level":3,"score":0.25440001487731934}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.04960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.04960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.5756425261497498}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1],"with":[2],"verifiable":[3],"rewards":[4,39],"(RLVR),":[5],"particularly":[6],"Group":[7],"Relative":[8],"Policy":[9],"Optimization":[10],"(GRPO),":[11],"has":[12],"advanced":[13],"LLM":[14],"reasoning.":[15],"However,":[16],"GRPO":[17,75,151],"suffers":[18],"from":[19,103],"three":[20],"credit":[21],"assignment":[22],"failures:":[23],"uniform":[24,32],"token-level":[25,112],"granularity":[26],"that":[27,34,45,79,122,142],"ignores":[28],"heterogeneous":[29],"informational":[30],"value,":[31],"polarity":[33,61],"penalizes":[35],"correct":[36],"steps":[37],"and":[38,42,63,118,147,152],"incorrect":[40],"ones,":[41],"zero-variance":[43],"collapse":[44],"erases":[46],"outcome-driven":[47],"gradients.":[48],"We":[49],"systematically":[50],"quantify":[51],"these":[52,69],"failures,":[53],"revealing":[54],"highly":[55],"non-uniform":[56],"token":[57],"informativeness,":[58],"widespread":[59],"step-level":[60],"misalignment,":[62],"substantial":[64],"training":[65],"waste.":[66],"To":[67],"address":[68],"limitations,":[70],"we":[71],"propose":[72],"Entropy-Progress":[73],"Aligned":[74],"(EP-GRPO),":[76],"a":[77],"framework":[78],"mines":[80],"the":[81],"model's":[82],"intrinsic":[83],"information":[84],"flow":[85,130],"for":[86,110],"dense,":[87],"self-supervised":[88],"guidance.":[89],"EP-GRPO":[90,143],"integrates":[91],"entropy-gated":[92],"modulation":[93],"to":[94,107,150],"prioritize":[95],"high":[96],"entropy":[97,120],"decision":[98],"pivots,":[99],"implicit":[100],"process":[101],"signals":[102],"policy":[104],"divergence":[105],"anchored":[106],"outcome":[108],"advantages":[109],"directional":[111],"feedback":[113],"without":[114],"external":[115],"reward":[116,133],"models,":[117],"cumulative":[119],"mapping":[121],"enables":[123],"progress-aligned":[124],"advantage":[125],"normalization,":[126],"naturally":[127],"maintaining":[128],"gradient":[129],"under":[131],"zero":[132],"variance.":[134],"Extensive":[135],"experiments":[136],"on":[137],"mathematical":[138],"reasoning":[139],"benchmarks":[140],"demonstrate":[141],"achieves":[144],"superior":[145],"accuracy":[146],"efficiency":[148],"compared":[149],"its":[153],"variants.":[154],"The":[155],"code":[156],"will":[157],"be":[158],"available.":[159]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-08T00:00:00"}
