{"id":"https://openalex.org/W7133342563","doi":"https://doi.org/10.48550/arxiv.2603.00724","title":"RLAR: An Agentic Reward System for Multi-task Reinforcement Learning on Large Language Models","display_name":"RLAR: An Agentic Reward System for Multi-task Reinforcement Learning on Large Language Models","publication_year":2026,"publication_date":"2026-02-28","ids":{"openalex":"https://openalex.org/W7133342563","doi":"https://doi.org/10.48550/arxiv.2603.00724"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00724","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00724","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00724","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127948511","display_name":"Andrew Zhuoer Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Feng, Andrew Zhuoer","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127966976","display_name":"Cunxiang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Cunxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127941245","display_name":"Bosi Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Bosi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127892038","display_name":"Yidong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yidong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128023128","display_name":"Yu Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128003481","display_name":"Hongning Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hongning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128003039","display_name":"Minlie Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Minlie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5127948511"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6018000245094299,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6018000245094299,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.06279999762773514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.04830000177025795,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.7786999940872192},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7649999856948853},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5827000141143799},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.5758000016212463},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.48190000653266907},{"id":"https://openalex.org/keywords/temporal-difference-learning","display_name":"Temporal difference learning","score":0.40709999203681946}],"concepts":[{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.7786999940872192},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7649999856948853},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6905999779701233},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5827000141143799},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.5758000016212463},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.565500020980835},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.48190000653266907},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4408000111579895},{"id":"https://openalex.org/C196340769","wikidata":"https://www.wikidata.org/wiki/Q7698910","display_name":"Temporal difference learning","level":3,"score":0.40709999203681946},{"id":"https://openalex.org/C143661069","wikidata":"https://www.wikidata.org/wiki/Q670713","display_name":"Reward system","level":2,"score":0.3971000015735626},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00724","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00724","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00724","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00724","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"alignment":[3],"via":[4],"reinforcement":[5],"learning":[6],"depends":[7],"critically":[8],"on":[9,148],"reward":[10,16,49,57,75,90,140],"function":[11],"quality.":[12],"However,":[13],"static,":[14],"domain-specific":[15],"models":[17,76],"are":[18,146],"often":[19],"costly":[20],"to":[21,51,71,92,113],"train":[22],"and":[23,64,80,119,129,144],"exhibit":[24],"poor":[25],"generalization":[26,137],"in":[27],"out-of-distribution":[28],"scenarios":[29],"encountered":[30],"during":[31,99],"RL":[32],"iterations.":[33],"We":[34],"present":[35],"RLAR":[36,55,105,124],"(Reinforcement":[37],"Learning":[38],"from":[39,77,111],"Agent":[40],"Rewards),":[41],"an":[42],"agent-driven":[43],"framework":[44],"that":[45,104],"dynamically":[46],"assigns":[47],"tailored":[48],"functions":[50],"individual":[52],"queries.":[53],"Specifically,":[54],"transforms":[56],"acquisition":[58],"into":[59],"a":[60],"dynamic":[61,139],"tool":[62],"synthesis":[63],"invocation":[65],"task.":[66],"It":[67],"leverages":[68],"LLM":[69],"agents":[70],"autonomously":[72],"retrieve":[73],"optimal":[74],"the":[78,89,95,131],"Internet":[79],"synthesize":[81],"programmatic":[82],"verifiers":[83],"through":[84,138],"code":[85,145],"generation.":[86],"This":[87],"allows":[88],"system":[91],"self-evolve":[93],"with":[94],"shifting":[96],"data":[97,143],"distributions":[98],"training.":[100],"Experimental":[101],"results":[102],"demonstrate":[103],"yields":[106],"consistent":[107],"performance":[108,132],"gains":[109],"ranging":[110],"10":[112],"60":[114],"across":[115],"mathematics,":[116],"coding,":[117],"translation,":[118],"dialogue":[120],"tasks.":[121],"On":[122],"RewardBench-V2,":[123],"significantly":[125],"outperforms":[126],"static":[127],"baselines":[128],"approaches":[130],"upper":[133],"bound,":[134],"demonstrating":[135],"superior":[136],"orchestration.":[141],"The":[142],"available":[147],"this":[149],"link:":[150],"https://github.com/ZhuoerFeng/RLAR.":[151]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
