{"id":"https://openalex.org/W7126231948","doi":"https://doi.org/10.48550/arxiv.2601.21268","title":"Reinforcement Learning from Meta-Evaluation: Aligning Language Models Without Ground-Truth Labels","display_name":"Reinforcement Learning from Meta-Evaluation: Aligning Language Models Without Ground-Truth Labels","publication_year":2026,"publication_date":"2026-01-29","ids":{"openalex":"https://openalex.org/W7126231948","doi":"https://doi.org/10.48550/arxiv.2601.21268"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.21268","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124358134","display_name":"Micah Rentschler","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rentschler, Micah","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124436640","display_name":"Jesse Roberts","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roberts, Jesse","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4821000099182129,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4821000099182129,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.06069999933242798,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.04580000042915344,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7854999899864197},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.6726999878883362},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.626800000667572},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5202000141143799},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4921000003814697},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.48080000281333923},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.47350001335144043},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.35690000653266907}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7854999899864197},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7364000082015991},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.675000011920929},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6726999878883362},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.626800000667572},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5202000141143799},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5001000165939331},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4921000003814697},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.48080000281333923},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.47350001335144043},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38370001316070557},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.35690000653266907},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.35109999775886536},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.3450999855995178},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3239000141620636},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.30250000953674316},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.25920000672340393}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.21268","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.21268","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.21268","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.21268","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Most":[0],"reinforcement":[1],"learning":[2,80],"(RL)":[3],"methods":[4],"for":[5],"training":[6],"large":[7],"language":[8],"models":[9,108],"(LLMs)":[10],"require":[11],"ground-truth":[12,123],"labels":[13,124],"or":[14,23,53],"task-specific":[15],"verifiers,":[16],"limiting":[17],"scalability":[18],"when":[19],"correctness":[20],"is":[21],"ambiguous":[22],"expensive":[24],"to":[25,45,98,119],"obtain.":[26],"We":[27],"introduce":[28],"Reinforcement":[29],"Learning":[30],"from":[31,41],"Meta-Evaluation":[32],"(RLME),":[33],"which":[34,131],"optimizes":[35],"a":[36,65,69,84],"generator":[37,74],"using":[38],"reward":[39,70],"derived":[40],"an":[42],"evaluator's":[43,62],"answers":[44],"natural-language":[46],"meta-questions":[47],"(e.g.,":[48],"\"Is":[49,54],"the":[50,55,61,73,128],"answer":[51],"correct?\"":[52],"reasoning":[56,111],"logically":[57],"consistent?\").":[58],"RLME":[59,91],"treats":[60],"probability":[63],"of":[64,86],"positive":[66],"judgment":[67],"as":[68],"and":[71,94,117],"updates":[72],"via":[75],"group-relative":[76],"policy":[77],"optimization,":[78],"enabling":[79],"without":[81],"labels.":[82],"Across":[83],"suite":[85],"experiments,":[87],"we":[88],"show":[89],"that":[90],"achieves":[92],"accuracy":[93],"sample":[95],"efficiency":[96],"comparable":[97],"label-based":[99],"training,":[100],"enables":[101],"controllable":[102],"trade-offs":[103],"among":[104],"multiple":[105],"objectives,":[106],"steers":[107],"toward":[109],"reliable":[110],"patterns":[112],"rather":[113],"than":[114],"post-hoc":[115],"rationalization,":[116],"generalizes":[118],"open-domain":[120],"settings":[121],"where":[122],"are":[125],"unavailable,":[126],"broadening":[127],"domains":[129],"in":[130],"LLMs":[132],"may":[133],"be":[134],"trained":[135],"with":[136],"RL.":[137]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-01T00:00:00"}
