{"id":"https://openalex.org/W7162777801","doi":"https://doi.org/10.48550/arxiv.2605.30244","title":"Reinforcement Learning with Robust Rubric Rewards","display_name":"Reinforcement Learning with Robust Rubric Rewards","publication_year":2026,"publication_date":"2026-05-28","ids":{"openalex":"https://openalex.org/W7162777801","doi":"https://doi.org/10.48550/arxiv.2605.30244"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.30244","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30244","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.30244","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100544606","display_name":"Yaqi Yu","orcid":"https://orcid.org/0009-0006-2700-8389"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Ya-Qi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137364380","display_name":"Hao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133631167","display_name":"Fangyu Hong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Fangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133666825","display_name":"Xiangyang Qu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Xiangyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137331111","display_name":"Gaojie Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Gaojie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133645240","display_name":"Qiaoyu Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Qiaoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137315342","display_name":"Nuo Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Nuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137375993","display_name":"Huixin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Huixin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066336097","display_name":"Wuheng Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Wuheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003873307","display_name":"Yongxin Liao","orcid":"https://orcid.org/0000-0001-5379-1481"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Yongxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137352781","display_name":"Zihao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137387451","display_name":"Haonan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haonan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137351313","display_name":"Ziming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ziming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137357294","display_name":"Dezhi Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Dezhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137311128","display_name":"Minghui Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Minghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137367189","display_name":"Jihao Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032063939","display_name":"Haoyu Ren","orcid":"https://orcid.org/0000-0002-0241-6507"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Haoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137355953","display_name":"Dandan Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Dandan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":18,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6575000286102295,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6575000286102295,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.054499998688697815,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0502999983727932,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.8855999708175659},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6909000277519226},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.5454000234603882},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.482699990272522},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4309999942779541},{"id":"https://openalex.org/keywords/base","display_name":"Base (topology)","score":0.3723999857902527}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.8855999708175659},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7057999968528748},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6909000277519226},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5550000071525574},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.5454000234603882},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.482699990272522},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.48170000314712524},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4309999942779541},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.3723999857902527},{"id":"https://openalex.org/C49895821","wikidata":"https://www.wikidata.org/wiki/Q5227368","display_name":"Data verification","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.26739999651908875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.30244","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30244","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.30244","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30244","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7218821048736572,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"Reinforcement":[1,51],"Learning":[2,52],"with":[3,53,77],"Verifiable":[4],"Rewards":[5,56],"(RLVR)":[6],"is":[7],"effective":[8],"for":[9,34,84],"deterministically":[10],"checkable":[11],"tasks,":[12],"many":[13],"vision-language":[14],"tasks":[15],"are":[16],"partially":[17],"verifiable,":[18],"demanding":[19],"multi-criteria":[20],"supervision":[21],"(e.g.,":[22],"perceptual":[23],"details,":[24],"reasoning":[25],"steps,":[26],"and":[27,103,119,144,157],"constraints).":[28],"Rubrics":[29],"provide":[30],"a":[31,78,93,137],"natural":[32],"interface":[33],"this":[35],"fine-grained":[36],"supervision,":[37],"but":[38],"their":[39],"effectiveness":[40],"depends":[41],"on":[42,127],"the":[43,141,146],"execution":[44,72],"accuracy":[45],"during":[46],"online":[47],"RL.":[48],"We":[49],"propose":[50],"Robust":[54],"Rubric":[55],"($\\text{RLR}^3$),":[57],"extending":[58],"RLVR":[59],"from":[60,101,105],"task-level":[61],"verification":[62,156],"to":[63,112],"criterion-level":[64],"verification.":[65],"$\\text{RLR}^3$":[66,91,108,132],"routes":[67],"instance-specific":[68],"rubrics":[69],"through":[70],"two":[71],"paths:":[73],"an":[74,82],"LLM-as-an-extractor":[75],"paired":[76],"deterministic":[79,155],"verifier,":[80],"or":[81],"LLM-as-a-Judge":[83],"non-verifiable":[85],"criteria.":[86],"To":[87],"ensure":[88],"faithful":[89],"scoring,":[90],"introduce":[92],"minimal":[94,158],"exposure":[95,159],"strategy":[96],"that":[97],"masks":[98],"ground":[99],"truths":[100],"extractors":[102],"images":[104],"judges.":[106],"Furthermore,":[107],"employs":[109],"hierarchical":[110],"aggregation":[111],"prioritize":[113],"essential":[114],"criteria":[115],"over":[116,140],"additional":[117],"criteria,":[118],"mitigates":[120],"score":[121],"saturation":[122],"within":[123],"rollout":[124],"groups.":[125],"Evaluated":[126],"Qwen3-VL-30B-A3B":[128],"across":[129],"15":[130],"benchmarks,":[131],"consistently":[133],"outperforms":[134],"RLVR,":[135],"yielding":[136],"4.7-point":[138],"improvement":[139],"base":[142],"model":[143,149],"exceeding":[145],"official":[147],"instruct-to-thinking":[148],"gap.":[150],"Controlled":[151],"audits":[152],"confirm":[153],"our":[154],"significantly":[160],"reduce":[161],"exploitable":[162],"false":[163],"positives.":[164]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-30T00:00:00"}
