{"id":"https://openalex.org/W7155413322","doi":"https://doi.org/10.48550/arxiv.2604.20659","title":"GRPO-VPS: Enhancing Group Relative Policy Optimization with Verifiable Process Supervision for Effective Reasoning","display_name":"GRPO-VPS: Enhancing Group Relative Policy Optimization with Verifiable Process Supervision for Effective Reasoning","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155413322","doi":"https://doi.org/10.48550/arxiv.2604.20659"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20659","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20659","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20659","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134411223","display_name":"Jingyi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jingyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134433968","display_name":"Lei Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057294035","display_name":"Tengjin Weng","orcid":"https://orcid.org/0009-0006-9572-2576"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weng, Tengjin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124077016","display_name":"Song-Li Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Song-Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134401789","display_name":"Haochen Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Haochen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134374781","display_name":"Jierun Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jierun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134383127","display_name":"Chaofan Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Chaofan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134424705","display_name":"Haoli Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Haoli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134438783","display_name":"Lu Hou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Lu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134455435","display_name":"Lifeng Shang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shang, Lifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134454389","display_name":"Xiao-Ping Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiao-Ping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2939000129699707,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2939000129699707,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.1395999938249588,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.13130000233650208,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.8324999809265137},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.6503000259399414},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.5209000110626221},{"id":"https://openalex.org/keywords/outcome","display_name":"Outcome (game theory)","score":0.4982999861240387},{"id":"https://openalex.org/keywords/group","display_name":"Group (periodic table)","score":0.3799999952316284},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.3199000060558319}],"concepts":[{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.8324999809265137},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6815999746322632},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.6503000259399414},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.5209000110626221},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.4982999861240387},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46799999475479126},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41350001096725464},{"id":"https://openalex.org/C2781311116","wikidata":"https://www.wikidata.org/wiki/Q83306","display_name":"Group (periodic table)","level":2,"score":0.3799999952316284},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C87833898","wikidata":"https://www.wikidata.org/wiki/Q1060280","display_name":"Advanced driver assistance systems","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.2874000072479248},{"id":"https://openalex.org/C110251889","wikidata":"https://www.wikidata.org/wiki/Q1569697","display_name":"Model checking","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20659","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20659","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20659","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20659","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.41203057765960693}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"Learning":[1],"with":[2],"Verifiable":[3],"Rewards":[4],"(RLVR)":[5],"has":[6],"advanced":[7],"the":[8,36,75,79,88,95,99,130],"reasoning":[9,57,84],"capabilities":[10],"of":[11,22,98],"Large":[12],"Language":[13],"Models":[14],"(LLMs)":[15],"by":[16],"leveraging":[17],"direct":[18],"outcome":[19],"verification":[20],"instead":[21],"learned":[23],"reward":[24],"models.":[25,143],"Building":[26],"on":[27,145,167,177],"this":[28,63],"paradigm,":[29],"Group":[30],"Relative":[31],"Policy":[32],"Optimization":[33],"(GRPO)":[34],"eliminates":[35],"need":[37,131],"for":[38,47,132],"critic":[39],"models":[40],"but":[41],"suffers":[42],"from":[43,136],"indiscriminate":[44],"credit":[45],"assignment":[46],"intermediate":[48,133],"steps,":[49],"which":[50],"limits":[51],"its":[52,83],"ability":[53],"to":[54,114,159,172],"identify":[55],"effective":[56],"strategies":[58],"and":[59,69,93,124,147,163,170,175],"incurs":[60],"overthinking.":[61],"In":[62],"work,":[64],"we":[65,107],"introduce":[66],"a":[67],"model-free":[68],"verifiable":[70],"process":[71],"supervision":[72,134],"via":[73],"probing":[74],"model's":[76],"belief":[77],"in":[78],"correct":[80,100],"answer":[81,101],"throughout":[82],"trajectory.":[85],"By":[86],"segmenting":[87],"generation":[89],"into":[90],"discrete":[91],"steps":[92],"tracking":[94],"conditional":[96],"probability":[97],"appended":[102],"at":[103],"each":[104],"segment":[105],"boundary,":[106],"efficiently":[108],"compute":[109],"interpretable":[110],"segment-wise":[111],"progress":[112],"measurements":[113],"refine":[115],"GRPO's":[116],"trajectory-level":[117],"feedback.":[118],"This":[119],"approach":[120],"enables":[121],"more":[122],"targeted":[123],"sample-efficient":[125],"policy":[126],"updates,":[127],"while":[128],"avoiding":[129],"derived":[135],"costly":[137],"Monte":[138],"Carlo":[139],"rollouts":[140],"or":[141],"auxiliary":[142],"Experiments":[144],"mathematical":[146],"general-domain":[148,178],"benchmarks":[149],"show":[150],"consistent":[151],"gains":[152],"over":[153],"GRPO":[154],"across":[155],"diverse":[156],"models:":[157],"up":[158,171],"2.6-point":[160],"accuracy":[161],"improvements":[162],"13.7%":[164],"reasoning-length":[165],"reductions":[166],"math":[168],"tasks,":[169,179],"2.4":[173],"points":[174],"4%":[176],"demonstrating":[180],"strong":[181],"generalization.":[182]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-24T00:00:00"}
