{"id":"https://openalex.org/W7162494031","doi":"https://doi.org/10.48550/arxiv.2605.26952","title":"Efficient Agentic Reinforcement Learning with On-Policy Intrinsic Knowledge Boundary Enhancement","display_name":"Efficient Agentic Reinforcement Learning with On-Policy Intrinsic Knowledge Boundary Enhancement","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162494031","doi":"https://doi.org/10.48550/arxiv.2605.26952"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.26952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.26952","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111333870","display_name":"Dingwei Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Dingwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137155580","display_name":"Zefang Zong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zong, Zefang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137096675","display_name":"Zhipeng Ma","orcid":"https://orcid.org/0000-0002-4049-539X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zhipeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137191362","display_name":"Leo Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Leo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137110237","display_name":"Yang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137144658","display_name":"Chengming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chengming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137099265","display_name":"Peng Chen","orcid":"https://orcid.org/0000-0002-3691-9719"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Peng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137129769","display_name":"Jie Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.8396999835968018,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.8396999835968018,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.021299999207258224,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.01080000028014183,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7261999845504761},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.6146000027656555},{"id":"https://openalex.org/keywords/boundary","display_name":"Boundary (topology)","score":0.4259999990463257},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.37450000643730164},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.35670000314712524},{"id":"https://openalex.org/keywords/learning-classifier-system","display_name":"Learning classifier system","score":0.3255000114440918},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.3248000144958496},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.32339999079704285}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7261999845504761},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7107999920845032},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6146000027656555},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5009999871253967},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4519999921321869},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.4259999990463257},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.37450000643730164},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.35670000314712524},{"id":"https://openalex.org/C199190896","wikidata":"https://www.wikidata.org/wiki/Q3509276","display_name":"Learning classifier system","level":3,"score":0.3255000114440918},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.3248000144958496},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3192000091075897},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2962999939918518},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C42023084","wikidata":"https://www.wikidata.org/wiki/Q5249231","display_name":"Decision boundary","level":3,"score":0.29120001196861267},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.2858999967575073},{"id":"https://openalex.org/C2777220311","wikidata":"https://www.wikidata.org/wiki/Q6423340","display_name":"Knowledge acquisition","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2702000141143799},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C115925183","wikidata":"https://www.wikidata.org/wiki/Q1412694","display_name":"Knowledge-based systems","level":2,"score":0.2535000145435333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.26952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.26952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.4445643723011017,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Agentic":[0],"reinforcement":[1],"learning":[2],"(RL)":[3],"has":[4],"proven":[5],"effective":[6],"for":[7,137],"training":[8,21,149],"LLM-based":[9],"agents":[10],"with":[11],"external":[12],"tool-use":[13,135],"capabilities.":[14],"However,":[15],"we":[16,73],"identify":[17],"that":[18,59,83,132,157],"agentic":[19,147,174],"RL":[20,148,193],"induces":[22],"increasing":[23],"redundant":[24],"tool":[25,116,168,179],"calls":[26,117,169],"and":[27,94,113,127,166,195],"blurs":[28],"the":[29,35,86,101,105,114,146,196],"model's":[30,87],"intrinsic":[31,88],"knowledge":[32,47,89,102],"boundary,":[33],"where":[34],"model":[36],"fails":[37],"to":[38,61,67],"distinguish":[39],"when":[40,45],"tools":[41,110],"are":[42,111,142],"needed":[43],"versus":[44],"parametric":[46],"suffices.":[48],"Existing":[49],"solutions":[50],"based":[51],"on":[52,152,164],"reward":[53,68],"shaping":[54],"create":[55],"coarse-grained":[56],"optimization":[57],"targets":[58],"tend":[60],"incentivize":[62],"indiscriminate":[63],"tool-call":[64],"suppression,":[65],"leading":[66],"hacking.":[69],"In":[70],"this":[71],"paper,":[72],"propose":[74],"AKBE":[75,124,158],"(Agentic":[76],"Knowledge":[77],"Boundary":[78],"Enhancement),":[79],"an":[80],"on-policy":[81],"method":[82],"dynamically":[84],"probes":[85],"boundary":[90,103],"through":[91],"dual-path":[92],"(with-tool":[93],"no-tool)":[95],"rollouts":[96],"during":[97],"training.":[98],"We":[99],"define":[100],"as":[104],"per-instance":[106],"determination":[107],"of":[108,198],"whether":[109],"required":[112],"minimum":[115],"necessary.":[118],"By":[119],"comparing":[120],"correctness":[121],"across":[122,191],"paths,":[123],"categorizes":[125],"trajectories":[126],"constructs":[128],"targeted":[129],"supervisory":[130],"signals":[131,141],"guide":[133],"efficient":[134],"patterns":[136],"each":[138,199],"question.":[139],"These":[140],"integrated":[143],"seamlessly":[144],"into":[145],"loop.":[150],"Experiments":[151],"seven":[153],"QA":[154],"benchmarks":[155],"demonstrate":[156],"improves":[159],"task":[160],"accuracy":[161],"by":[162,170],"+1.85":[163],"average":[165],"reduces":[167],"18%":[171],"over":[172],"standard":[173],"RL,":[175],"yielding":[176],"25%":[177],"higher":[178],"productivity":[180],"without":[181],"any":[182],"accuracy-efficiency":[183],"trade-off.":[184],"Further":[185],"analysis":[186],"suggests":[187],"its":[188],"plug-and-play":[189],"compatibility":[190],"different":[192],"algorithms":[194],"mechanism":[197],"signal":[200],"category.":[201],"Our":[202],"code":[203],"is":[204],"available":[205],"at":[206],"https://github.com/CuSO4-Chen/AKBE.":[207]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
