{"id":"https://openalex.org/W7140105616","doi":"https://doi.org/10.18653/v1/2026.findings-eacl.328","title":"Turn-PPO: Turn-Level Advantage Estimation with PPO for Improved Multi-Turn RL in Agentic LLMs","display_name":"Turn-PPO: Turn-Level Advantage Estimation with PPO for Improved Multi-Turn RL in Agentic LLMs","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7140105616","doi":"https://doi.org/10.18653/v1/2026.findings-eacl.328"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2026.findings-eacl.328","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.findings-eacl.328","pdf_url":"https://aclanthology.org/2026.findings-eacl.328.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EACL 2026","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2026.findings-eacl.328.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130349110","display_name":"Junbo Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junbo Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130359032","display_name":"Peng Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng Zhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130368631","display_name":"Rui Meng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rui Meng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130347319","display_name":"Meet P. Vadera","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meet P. Vadera","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130333778","display_name":"Lihong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lihong Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130399921","display_name":"Yang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37683525,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"6227","last_page":"6243"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11522","display_name":"VLSI and FPGA Design Techniques","score":0.10689999908208847,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11522","display_name":"VLSI and FPGA Design Techniques","score":0.10689999908208847,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10100","display_name":"Metaheuristic Optimization Algorithms Research","score":0.03669999912381172,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.0340999998152256,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/estimation","display_name":"Estimation","score":0.48510000109672546},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.31450000405311584},{"id":"https://openalex.org/keywords/production","display_name":"Production (economics)","score":0.2955000102519989},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.27799999713897705}],"concepts":[{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.48510000109672546},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4715000092983246},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.4456000030040741},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.4440999925136566},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.3508000075817108},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.31450000405311584},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2892000079154968},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C162118730","wikidata":"https://www.wikidata.org/wiki/Q1128453","display_name":"Actuarial science","level":1,"score":0.2743000090122223},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2026.findings-eacl.328","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.findings-eacl.328","pdf_url":"https://aclanthology.org/2026.findings-eacl.328.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EACL 2026","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2026.findings-eacl.328","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.findings-eacl.328","pdf_url":"https://aclanthology.org/2026.findings-eacl.328.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EACL 2026","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7140105616.pdf","grobid_xml":"https://content.openalex.org/works/W7140105616.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1],"(RL)":[2],"has":[3],"re-emerged":[4],"as":[5,62,92],"a":[6,83,88],"natural":[7],"approach":[8],"for":[9,53],"training":[10],"interactive":[11],"LLM":[12],"agents":[13],"in":[14,35,77],"real-world":[15],"environments.However,":[16],"directly":[17],"applying":[18],"the":[19,95,102,108],"widely":[20],"used":[21,97],"Group":[22],"Relative":[23],"Policy":[24,59],"Optimization":[25,60],"(GRPO)":[26],"algorithm":[27],"to":[28,68,94],"multi-turn":[29,54,78],"tasks":[30],"exposes":[31],"notable":[32],"limitations,":[33],"particularly":[34],"scenarios":[36],"requiring":[37],"long-horizon":[38],"reasoning.To":[39],"address":[40],"these":[41],"challenges,":[42],"we":[43,80],"investigate":[44],"more":[45,70],"stable":[46],"and":[47,65,104,114],"effective":[48],"advantage":[49],"estimation":[50],"strategies,":[51],"especially":[52],"settings.We":[55],"first":[56],"explore":[57],"Proximal":[58],"(PPO)":[61],"an":[63],"alternative":[64],"find":[66],"it":[67],"be":[69],"robust":[71],"than":[72],"GRPO.To":[73],"further":[74],"enhance":[75],"PPO":[76],"scenarios,":[79],"introduce":[81],"turn-PPO,":[82,111],"variant":[84],"that":[85],"operates":[86],"on":[87,101],"turn-level":[89],"MDP":[90],"formulation,":[91],"opposed":[93],"commonly":[96],"token-level":[98],"MDP.Our":[99],"results":[100],"WebShop":[103],"Sokoban":[105],"datasets":[106],"demonstrate":[107],"effectiveness":[109],"of":[110],"both":[112],"with":[113],"without":[115],"long":[116],"reasoning":[117],"components.":[118]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-24T00:00:00"}
