{"id":"https://openalex.org/W7138896440","doi":"https://doi.org/10.48550/arxiv.2603.16600","title":"Rationale Matters: Learning Transferable Rubrics via Proxy-Guided Critique for VLM Reward Models","display_name":"Rationale Matters: Learning Transferable Rubrics via Proxy-Guided Critique for VLM Reward Models","publication_year":2026,"publication_date":"2026-03-17","ids":{"openalex":"https://openalex.org/W7138896440","doi":"https://doi.org/10.48550/arxiv.2603.16600"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.16600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.16600","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130139767","display_name":"Weijie Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qiu, Weijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130183062","display_name":"Dai Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Dai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129964583","display_name":"Junxin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Junxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101804803","display_name":"Zhihang Li","orcid":"https://orcid.org/0000-0002-9305-7924"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhihang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057078698","display_name":"YongBo Gai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gai, Yongbo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129846004","display_name":"Mengyu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Mengyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129811730","display_name":"Erchao Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Erchao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121672896","display_name":"Xiaoxi Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Xiaoxi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129890760","display_name":"Guanjun Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Guanjun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5130139767"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8952999711036682,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8952999711036682,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.03720000013709068,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.012000000104308128,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.9876999855041504},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4131999909877777},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.3767000138759613},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.3756999969482422},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.3012999892234802},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.2896000146865845}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.9876999855041504},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6621999740600586},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5169000029563904},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.46369999647140503},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4131999909877777},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.3767000138759613},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.3756999969482422},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32179999351501465},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.295199990272522},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.2896000146865845},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C2780148112","wikidata":"https://www.wikidata.org/wiki/Q1432581","display_name":"Proxy (statistics)","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2574999928474426},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.16600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.16600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Generative":[0],"reward":[1,164,177],"models":[2,6],"(GRMs)":[3],"for":[4],"vision-language":[5],"(VLMs)":[7],"often":[8],"evaluate":[9],"outputs":[10],"via":[11],"a":[12,20,82,111,157],"three-stage":[13],"pipeline:":[14],"rubric":[15,26,60,69,84,102],"generation,":[16],"criterion-based":[17],"scoring,":[18],"and":[19,50,78,90,93,124,141,162],"final":[21],"verdict.":[22],"However,":[23],"the":[24,87,96,101,115,135,145,151,169],"intermediate":[25],"is":[27,156,187],"rarely":[28],"optimized":[29],"directly.":[30],"Prior":[31],"work":[32],"typically":[33],"either":[34],"treats":[35],"rubrics":[36,119,171],"as":[37,103,110],"incidental":[38],"or":[39],"relies":[40],"on":[41,134,148],"expensive":[42],"LLM-as-judge":[43],"checks":[44],"that":[45,80,120],"provide":[46],"no":[47],"differentiable":[48],"signal":[49],"limited":[51],"training-time":[52],"guidance.":[53],"We":[54],"propose":[55],"Proxy-GRM,":[56],"which":[57],"introduces":[58],"proxy-guided":[59],"verification":[61],"into":[62],"Reinforcement":[63],"Learning":[64],"(RL)":[65],"to":[66,117,173],"explicitly":[67],"enhance":[68],"quality.":[70],"Concretely,":[71],"we":[72],"train":[73],"lightweight":[74],"proxy":[75],"agents":[76],"(Proxy-SFT":[77],"Proxy-RL)":[79],"take":[81],"candidate":[83],"together":[85],"with":[86],"original":[88],"query":[89],"preference":[91,97],"pair,":[92],"then":[94],"predict":[95],"ordering":[98],"using":[99],"only":[100],"evidence.":[104],"The":[105],"proxy's":[106],"prediction":[107],"accuracy":[108,178],"serves":[109],"rubric-quality":[112],"reward,":[113],"incentivizing":[114],"model":[116],"produce":[118],"are":[121],"internally":[122],"consistent":[123],"transferable.":[125],"With":[126],"~50k":[127],"data":[128],"samples,":[129],"Proxy-GRM":[130],"reaches":[131],"state-of-the-art":[132],"results":[133],"VL-Reward":[136],"Bench,":[137,140,143],"Multimodal":[138],"Reward":[139],"MM-RLHF-Reward":[142],"outperforming":[144],"methods":[146],"trained":[147],"four":[149],"times":[150],"data.":[152],"Ablations":[153],"show":[154],"Proxy-SFT":[155],"stronger":[158],"verifier":[159],"than":[160],"Proxy-RL,":[161],"implicit":[163],"aggregation":[165],"performs":[166],"best.":[167],"Crucially,":[168],"learned":[170],"transfer":[172],"unseen":[174],"evaluators,":[175],"improving":[176],"at":[179,189],"test":[180],"time":[181],"without":[182],"additional":[183],"training.":[184],"Our":[185],"code":[186],"available":[188],"https://github.com/Qwen-Applications/Proxy-GRM.":[190]},"counts_by_year":[],"updated_date":"2026-03-20T20:54:20.808490","created_date":"2026-03-20T00:00:00"}
