{"id":"https://openalex.org/W7134058464","doi":"https://doi.org/10.48550/arxiv.2603.04597","title":"Bootstrapping Exploration with Group-Level Natural Language Feedback in Reinforcement Learning","display_name":"Bootstrapping Exploration with Group-Level Natural Language Feedback in Reinforcement Learning","publication_year":2026,"publication_date":"2026-03-04","ids":{"openalex":"https://openalex.org/W7134058464","doi":"https://doi.org/10.48550/arxiv.2603.04597"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.04597","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128256432","display_name":"Lei Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128235921","display_name":"Xiang Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Xiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128270164","display_name":"Chenxiao Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Chenxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081220735","display_name":"Guobin Shen","orcid":"https://orcid.org/0000-0002-4069-2107"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Guobin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128255771","display_name":"Junjie Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Junjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128252519","display_name":"Xiaocheng Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Xiaocheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128226851","display_name":"Yuxuan Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Yuxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128249664","display_name":"Xing Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Xing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128274586","display_name":"Bing Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Bing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3046000003814697,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3046000003814697,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.19609999656677246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.10329999774694443,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7929999828338623},{"id":"https://openalex.org/keywords/natural-language-understanding","display_name":"Natural language understanding","score":0.5557000041007996},{"id":"https://openalex.org/keywords/bootstrapping","display_name":"Bootstrapping (finance)","score":0.5465999841690063},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5310999751091003},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.508899986743927},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46560001373291016},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.4474000036716461}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7929999828338623},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.76419997215271},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.5557000041007996},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.5465999841690063},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5310999751091003},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.508899986743927},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4934000074863434},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46560001373291016},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.4474000036716461},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42820000648498535},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.35249999165534973},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.3467000126838684},{"id":"https://openalex.org/C186886427","wikidata":"https://www.wikidata.org/wiki/Q5441213","display_name":"Feedback loop","level":2,"score":0.32899999618530273},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.30149999260902405},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.2621999979019165}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.04597","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.04597","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.04597","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.04597","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1,8,53],"models":[2],"(LLMs)":[3],"typically":[4],"receive":[5],"diverse":[6,88],"natural":[7],"(NL)":[9],"feedback":[10,33,54,66],"through":[11,59],"interaction":[12],"with":[13],"the":[14,28],"environment.":[15],"However,":[16],"current":[17],"reinforcement":[18],"learning":[19],"(RL)":[20],"algorithms":[21],"rely":[22],"solely":[23,164],"on":[24,138,165],"scalar":[25,166],"rewards,":[26],"leaving":[27],"rich":[29],"information":[30],"in":[31,113,156],"NL":[32],"underutilized":[34],"and":[35,78,87,121,141,150],"leading":[36],"to":[37,55,96,109,160],"inefficient":[38],"exploration.":[39],"In":[40],"this":[41],"work,":[42],"we":[43],"propose":[44,75],"GOLF,":[45],"an":[46],"RL":[47,126,161],"framework":[48],"that":[49,71,82,132,145],"explicitly":[50],"exploits":[51],"group-level":[52,92],"guide":[56],"targeted":[57,76,111],"exploration":[58,151],"actionable":[60],"refinements.":[61],"GOLF":[62,117,146],"aggregates":[63],"two":[64],"complementary":[65],"sources:":[67],"(i)":[68],"external":[69],"critiques":[70],"pinpoint":[72],"errors":[73],"or":[74],"fixes,":[77],"(ii)":[79],"intra-group":[80],"attempts":[81],"supply":[83],"alternative":[84],"partial":[85],"ideas":[86],"failure":[89],"patterns.":[90],"These":[91],"feedbacks":[93],"are":[94,101],"aggregated":[95],"produce":[97],"high-quality":[98],"refinements,":[99],"which":[100],"adaptively":[102],"injected":[103],"into":[104],"training":[105],"as":[106],"off-policy":[107],"scaffolds":[108],"provide":[110],"guidance":[112],"sparse-reward":[114],"regions.":[115],"Meanwhile,":[116],"jointly":[118],"optimizes":[119],"generation":[120],"refinement":[122],"within":[123],"a":[124,129],"unified":[125],"loop,":[127],"creating":[128],"virtuous":[130],"cycle":[131],"continuously":[133],"improves":[134],"both":[135,139],"capabilities.":[136],"Experiments":[137],"verifiable":[140],"non-verifiable":[142],"benchmarks":[143],"show":[144],"achieves":[147],"superior":[148],"performance":[149],"efficiency,":[152],"achieving":[153],"2.2$\\times$":[154],"improvements":[155],"sample":[157],"efficiency":[158],"compared":[159],"methods":[162],"trained":[163],"rewards.":[167],"Code":[168],"is":[169],"available":[170],"at":[171],"https://github.com/LuckyyySTA/GOLF.":[172]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-07T00:00:00"}
