{"id":"https://openalex.org/W7161044001","doi":"https://doi.org/10.48550/arxiv.2605.12070","title":"Missing Old Logits in Asynchronous Agentic RL: Semantic Mismatch and Repair Methods for Off-Policy Correction","display_name":"Missing Old Logits in Asynchronous Agentic RL: Semantic Mismatch and Repair Methods for Off-Policy Correction","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161044001","doi":"https://doi.org/10.48550/arxiv.2605.12070"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.12070","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12070","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.12070","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125195887","display_name":"Zhong Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Zhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136062549","display_name":"Yongjian Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, 
Yongjian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136081479","display_name":"Haoran Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Haoran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136040221","display_name":"Wen Huang","orcid":"https://orcid.org/0009-0001-7302-9096"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Wen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041877213","display_name":"Shuai Di","orcid":"https://orcid.org/0000-0001-7466-9709"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Di, Shuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109988314","display_name":"Xiong Jun Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xiong Jun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136001855","display_name":"Likang Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Likang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136073344","display_name":"Hongke Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, 
Hongke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2483000010251999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2483000010251999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.07810000330209732,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.05730000138282776,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer 
Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.847100019454956},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.5181000232696533},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5083000063896179},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.46369999647140503},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.42579999566078186},{"id":"https://openalex.org/keywords/decoupling","display_name":"Decoupling (probability)","score":0.42179998755455017},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.38600000739097595},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.3693000078201294}],"concepts":[{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.847100019454956},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7958999872207642},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.5181000232696533},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5083000063896179},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer 
science)","level":2,"score":0.46369999647140503},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.42179998755455017},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.38600000739097595},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3693000078201294},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.36390000581741333},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3564999997615814},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.32580000162124634},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.31150001287460327},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C2776848632","wikidata":"https://www.wikidata.org/wiki/Q853463","display_name":"Clipping 
(morphology)","level":2,"score":0.28929999470710754},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.28209999203681946},{"id":"https://openalex.org/C2779506182","wikidata":"https://www.wikidata.org/wiki/Q7580141","display_name":"Spotting","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.265500009059906},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.25450000166893005}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.12070","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12070","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell 
University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.12070","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12070","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Asynchronous":[0],"reinforcement":[1],"learning":[2],"improves":[3],"rollout":[4,154],"throughput":[5],"for":[6,26],"large":[7],"language":[8],"model":[9],"agents":[10],"by":[11],"decoupling":[12],"sample":[13],"generation":[14],"from":[15,70],"policy":[16,73,181],"optimization,":[17],"but":[18],"it":[19],"also":[20],"introduces":[21],"a":[22,47,63,146,177,202],"critical":[23],"failure":[24],"mode":[25],"PPO-style":[27],"off-policy":[28],"correction.":[29],"In":[30],"heterogeneous":[31],"training":[32,212],"systems,":[33],"the":[34,58,68,71,75,92,110,162,171],"total":[35],"importance":[36],"ratio":[37],"should":[38],"ideally":[39],"be":[40,187],"decomposed":[41],"into":[42],"two":[43],"semantically":[44],"distinct":[45],"factors:":[46],"\\emph{training--i
nference":[48],"discrepancy":[49,104],"term}":[50,65],"that":[51,66,80],"aligns":[52],"inference-side":[53],"and":[54,62,87,116,119,132,150,156,214],"training-side":[55,95],"distributions":[56],"at":[57,189],"same":[59],"behavior-policy":[60],"version,":[61],"\\emph{policy-staleness":[64],"constrains":[67],"update":[69],"historical":[72,94],"to":[74],"current":[76],"policy.":[77],"We":[78,136],"show":[79],"practical":[81],"asynchronous":[82],"pipelines":[83],"with":[84,106],"delayed":[85],"updates":[86],"partial":[88,153],"rollouts":[89],"often":[90],"lose":[91],"required":[93],"logits,":[96],"or":[97],"old":[98,184],"logits.":[99],"This":[100],"missing-old-logit":[101],"problem":[102],"entangles":[103],"repair":[105],"staleness":[107],"correction,":[108,115,166],"breaks":[109],"intended":[111],"semantics":[112],"of":[113,164,173],"decoupled":[114,174],"makes":[117],"clipping":[118],"masking":[120],"thresholds":[121],"interact":[122],"undesirably.":[123],"To":[124],"address":[125],"this":[126,198],"issue,":[127],"we":[128,167,200],"study":[129],"both":[130,211],"exact":[131,139,183],"approximate":[133,165,180],"correction":[134,175],"routes.":[135],"propose":[137],"three":[138],"old-logit":[140,148],"acquisition":[141],"strategies:":[142],"snapshot-based":[143],"version":[144],"tracking,":[145],"dedicated":[147],"model,":[149],"synchronization":[151],"via":[152],"interruption,":[155],"compare":[157],"their":[158],"system":[159,195],"trade-offs.":[160],"From":[161],"perspective":[163],"focus":[168],"on":[169],"preserving":[170],"benefits":[172],"through":[176],"more":[178],"appropriate":[179],"when":[182],"logits":[185],"cannot":[186],"recovered":[188],"low":[190],"cost,":[191],"without":[192],"incurring":[193],"extra":[194],"overhead.":[196],"Following":[197],"analysis,":[199],"adopt":[201],"revised":[203],"PPO-EWMA":[204],"method,":[205],"which":[206],"achieves":[207],"significant":[208],"gains":[209],"in":[210],"speed":[213],"optimization":[215],"performance.":[21
6]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
