{"id":"https://openalex.org/W7130728993","doi":"https://doi.org/10.48550/arxiv.2602.17497","title":"Retrospective In-Context Learning for Temporal Credit Assignment with Large Language Models","display_name":"Retrospective In-Context Learning for Temporal Credit Assignment with Large Language Models","publication_year":2026,"publication_date":"2026-02-19","ids":{"openalex":"https://openalex.org/W7130728993","doi":"https://doi.org/10.48550/arxiv.2602.17497"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.17497","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113084765","display_name":"Wentse Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Wen-Tse","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126482811","display_name":"Jiayu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiayu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046483574","display_name":"Fahim Tajwar","orcid":"https://orcid.org/0000-0001-9257-6282"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tajwar, Fahim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126461266","display_name":"Hao Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126448043","display_name":"Xintong Duan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duan, Xintong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126501768","display_name":"Ruslan Salakhutdinov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Salakhutdinov, Ruslan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Schneider, Jeff","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schneider, Jeff","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5113084765"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.4244999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.4244999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.07000000029802322,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.06430000066757202,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.8709999918937683},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.531499981880188},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.4318999946117401},{"id":"https://openalex.org/keywords/credit-risk","display_name":"Credit risk","score":0.39660000801086426},{"id":"https://openalex.org/keywords/temporal-difference-learning","display_name":"Temporal difference learning","score":0.38960000872612},{"id":"https://openalex.org/keywords/semi-supervised-learning","display_name":"Semi-supervised learning","score":0.32330000400543213},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3165000081062317}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.8709999918937683},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6978999972343445},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5914999842643738},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5748000144958496},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.531499981880188},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.4318999946117401},{"id":"https://openalex.org/C178350159","wikidata":"https://www.wikidata.org/wiki/Q162714","display_name":"Credit risk","level":2,"score":0.39660000801086426},{"id":"https://openalex.org/C196340769","wikidata":"https://www.wikidata.org/wiki/Q7698910","display_name":"Temporal difference learning","level":3,"score":0.38960000872612},{"id":"https://openalex.org/C58973888","wikidata":"https://www.wikidata.org/wiki/Q1041418","display_name":"Semi-supervised learning","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3165000081062317},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.2777999937534332},{"id":"https://openalex.org/C2986087404","wikidata":"https://www.wikidata.org/wiki/Q15946010","display_name":"Online learning","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C24138899","wikidata":"https://www.wikidata.org/wiki/Q17141258","display_name":"Instance-based learning","level":3,"score":0.2535000145435333}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.17497","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.17497","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17497","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.17497","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Learning":[0],"from":[1,45,61,102],"self-sampled":[2],"data":[3],"and":[4,49,118,171],"sparse":[5,24,68],"environmental":[6],"feedback":[7,25],"remains":[8],"a":[9],"fundamental":[10],"challenge":[11],"in":[12,123],"training":[13,72],"self-evolving":[14],"agents.":[15],"Temporal":[16],"credit":[17,41,99,128,163],"assignment":[18,100],"mitigates":[19],"this":[20,53],"issue":[21],"by":[22],"transforming":[23],"into":[26,70],"dense":[27,71],"supervision":[28],"signals.":[29],"However,":[30],"previous":[31],"approaches":[32],"typically":[33],"depend":[34],"on":[35,97,132],"learning":[36,81,88],"task-specific":[37],"value":[38],"functions":[39],"for":[40,126,161,168],"assignment,":[42,164],"which":[43,91],"suffer":[44],"poor":[46],"sample":[47,151],"efficiency":[48],"limited":[50,116],"generalization.":[51],"In":[52],"work,":[54],"we":[55],"propose":[56,85],"to":[57,66],"leverage":[58],"pretrained":[59],"knowledge":[60],"large":[62],"language":[63],"models":[64],"(LLMs)":[65],"transform":[67],"rewards":[69],"signals":[73],"(i.e.,":[74],"the":[75,94,98,112,124,156,166],"advantage":[76,113],"function)":[77],"through":[78],"retrospective":[79],"in-context":[80],"(RICL).":[82],"We":[83,104],"further":[84],"an":[86],"online":[87,145],"framework,":[89],"RICOL,":[90],"iteratively":[92],"refines":[93],"policy":[95],"based":[96],"results":[101],"RICL.":[103],"empirically":[105],"demonstrate":[106],"that":[107,137],"RICL":[108],"can":[109],"accurately":[110],"estimate":[111],"function":[114],"with":[115,143,148],"samples":[117],"effectively":[119],"identify":[120],"critical":[121],"states":[122],"environment":[125],"temporal":[127,162],"assignment.":[129],"Extended":[130],"evaluation":[131],"four":[133],"BabyAI":[134],"scenarios":[135],"show":[136],"RICOL":[138],"achieves":[139],"comparable":[140],"convergent":[141],"performance":[142],"traditional":[144],"RL":[146,173],"algorithms":[147],"significantly":[149],"higher":[150],"efficiency.":[152],"Our":[153],"findings":[154],"highlight":[155],"potential":[157],"of":[158],"leveraging":[159],"LLMs":[160],"paving":[165],"way":[167],"more":[169],"sample-efficient":[170],"generalizable":[172],"paradigms.":[174]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-21T00:00:00"}
