{"id":"https://openalex.org/W7140233304","doi":"https://doi.org/10.48550/arxiv.2603.21383","title":"PivotRL: High Accuracy Agentic Post-Training at Low Compute Cost","display_name":"PivotRL: High Accuracy Agentic Post-Training at Low Compute Cost","publication_year":2026,"publication_date":"2026-03-22","ids":{"openalex":"https://openalex.org/W7140233304","doi":"https://doi.org/10.48550/arxiv.2603.21383"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21383","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21383","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21383","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yi, Junkeun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi, Junkeun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Mosk-Aoyama, Damon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mosk-Aoyama, Damon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Huang, Baihe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Baihe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gala, Ritu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gala, Ritu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Charles","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Charles","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Devare, Sugam Dipak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Devare, Sugam Dipak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bhardwaj, Khushi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhardwaj, Khushi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gupta, Abhibha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Abhibha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Kuchaiev, Oleksii","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kuchaiev, Oleksii","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jiao, Jiantao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiao, Jiantao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Jian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Srinivasan, Venkat","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Srinivasan, Venkat","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.8859999775886536,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.8859999775886536,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.018799999728798866,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.018300000578165054,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5666000247001648},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.5460000038146973},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.5016999840736389},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.49880000948905945},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.42309999465942383}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6437000036239624},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5666000247001648},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.5460000038146973},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.5016999840736389},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.49880000948905945},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.42309999465942383},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3910999894142151},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.3855000138282776},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38440001010894775},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30660000443458557},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25619998574256897},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21383","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21383","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21383","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21383","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Post-training":[0],"for":[1,88,105],"long-horizon":[2],"agentic":[3,167,179,206],"tasks":[4],"has":[5],"a":[6,51],"tension":[7],"between":[8],"compute":[9,18,39,63],"efficiency":[10,64],"and":[11,86,169],"generalization.":[12],"While":[13],"supervised":[14],"fine-tuning":[15],"(SFT)":[16],"is":[17,195],"efficient,":[19],"it":[20,81,102],"often":[21],"suffers":[22],"from":[23],"out-of-domain":[24],"(OOD)":[25],"degradation.":[26],"Conversely,":[27],"end-to-end":[28],"reinforcement":[29],"learning":[30,127],"(E2E":[31],"RL)":[32],"preserves":[33],"OOD":[34,69,172],"capabilities,":[35],"but":[36],"incurs":[37],"high":[38,97,130],"costs":[40],"due":[41],"to":[42,60,143,148],"many":[43],"turns":[44,92],"of":[45,65,71],"on-policy":[46,84],"rollout.":[47],"We":[48,119],"introduce":[49],"PivotRL,":[50],"novel":[52],"framework":[53],"that":[54,122,156],"operates":[55],"on":[56,76,140,151,163,178],"existing":[57],"SFT":[58,66,116,150],"trajectories":[59],"combine":[61],"the":[62,68,115,202],"with":[67,114,129,186,189],"accuracy":[70,162,173,185],"E2E":[72,187],"RL.":[73],"PivotRL":[74,157,182,194],"relies":[75],"two":[77],"key":[78],"mechanisms:":[79],"first,":[80],"executes":[82],"local,":[83],"rollouts":[85],"filters":[87],"pivots:":[89],"informative":[90],"intermediate":[91],"where":[93],"sampled":[94],"actions":[95,107,141],"exhibit":[96],"variance":[98],"in":[99,174,204],"outcomes;":[100],"second,":[101],"utilizes":[103],"rewards":[104],"functional-equivalent":[106],"rather":[108],"than":[109],"demanding":[110],"strict":[111],"string":[112],"matching":[113],"data":[117],"demonstration.":[118],"theoretically":[120],"show":[121],"these":[123],"mechanisms":[124],"incentivize":[125],"strong":[126],"signals":[128],"natural":[131],"gradient":[132],"norm,":[133],"while":[134],"maximally":[135],"preserving":[136],"policy":[137],"probability":[138],"ordering":[139],"unrelated":[142],"training":[144],"tasks.":[145,176],"In":[146],"comparison":[147],"standard":[149],"identical":[152],"data,":[153],"we":[154],"demonstrate":[155],"achieves":[158,183],"+4.17%":[159],"higher":[160,171],"in-domain":[161],"average":[164],"across":[165],"four":[166],"domains,":[168],"+10.04%":[170],"non-agentic":[175],"Notably,":[177],"coding":[180],"tasks,":[181],"competitive":[184],"RL":[188],"4x":[190],"fewer":[191],"rollout":[192],"turns.":[193],"adopted":[196],"by":[197],"NVIDIA's":[198],"Nemotron-3-Super-120B-A12B,":[199],"acting":[200],"as":[201],"workhorse":[203],"production-scale":[205],"post-training.":[207]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-25T00:00:00"}
