{"id":"https://openalex.org/W7139924294","doi":"https://doi.org/10.48550/arxiv.2603.18683","title":"HISR: Hindsight Information Modulated Segmental Process Rewards For Multi-turn Agentic Reinforcement Learning","display_name":"HISR: Hindsight Information Modulated Segmental Process Rewards For Multi-turn Agentic Reinforcement Learning","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7139924294","doi":"https://doi.org/10.48550/arxiv.2603.18683"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.18683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.18683","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130246231","display_name":"Zhicong Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Zhicong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130229134","display_name":"Zichuan Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Zichuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130215960","display_name":"Wei Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052887619","display_name":"Changyuan Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Changyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130215384","display_name":"Deheng Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Deheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130245998","display_name":"Peiguang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Peiguang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130234573","display_name":"Li Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130237230","display_name":"Nayu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Nayu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057853630","display_name":"Guangluan Xu","orcid":"https://orcid.org/0000-0003-3529-593X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Guangluan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130222100","display_name":"Wei Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.45010000467300415,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.45010000467300415,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08330000191926956,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.06210000067949295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hindsight-bias","display_name":"Hindsight bias","score":0.982699990272522},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.7192000150680542},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6743000149726868},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.6273000240325928},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5483999848365784},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5303999781608582},{"id":"https://openalex.org/keywords/outcome","display_name":"Outcome (game theory)","score":0.4724999964237213},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.45840001106262207}],"concepts":[{"id":"https://openalex.org/C10347200","wikidata":"https://www.wikidata.org/wiki/Q1960297","display_name":"Hindsight bias","level":2,"score":0.982699990272522},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.7192000150680542},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6743000149726868},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.6273000240325928},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5483999848365784},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5401999950408936},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5303999781608582},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.4724999964237213},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.45840001106262207},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45170000195503235},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.44200000166893005},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42399999499320984},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.4235999882221222},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4032000005245209},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.40149998664855957},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.391400009393692},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.37770000100135803},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.36550000309944153},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.30820000171661377},{"id":"https://openalex.org/C59656382","wikidata":"https://www.wikidata.org/wiki/Q191536","display_name":"Conjunction (astronomy)","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C196340769","wikidata":"https://www.wikidata.org/wiki/Q7698910","display_name":"Temporal difference learning","level":3,"score":0.2824999988079071}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.18683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.18683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8047254085540771,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"large":[1],"language":[2],"models":[3,26],"excel":[4],"in":[5,41,102,115,167],"diverse":[6],"domains,":[7],"their":[8],"performance":[9,30],"on":[10,22,180],"complex":[11],"longhorizon":[12],"agentic":[13],"decision-making":[14],"tasks":[15],"remains":[16],"limited.":[17],"Most":[18],"existing":[19],"methods":[20],"concentrate":[21],"designing":[23],"effective":[24],"reward":[25],"(RMs)":[27],"to":[28,67,82,96,109,123,152,161],"advance":[29],"via":[31],"multi-turn":[32],"reinforcement":[33],"learning.":[34],"However,":[35],"they":[36],"suffer":[37],"from":[38],"delayed":[39],"propagation":[40],"sparse":[42],"outcome":[43],"rewards":[44,75,98],"and":[45,53,78,149],"unreliable":[46],"credit":[47,87,174],"assignment":[48,175],"with":[49,76],"potentially":[50],"overly":[51],"fine-grained":[52],"unfocused":[54],"turnlevel":[55],"process":[56,70,92,171],"rewards.":[57],"In":[58],"this":[59,138],"paper,":[60],"we":[61,140],"propose":[62],"(HISR)":[63],"exploiting":[64],"Hindsight":[65],"Information":[66],"modulate":[68,169],"Segmental":[69],"Rewards,":[71],"which":[72,166],"closely":[73],"aligns":[74],"sub-goals":[77],"underscores":[79],"significant":[80,113],"segments":[81,114],"enhance":[83],"the":[84,103,116,125,134,142,185],"reliability":[85],"of":[86,127,144,187],"assignment.":[88],"Specifically,":[89],"a":[90,118,129],"segment-level":[91],"RM":[93],"is":[94,121],"presented":[95],"assign":[97],"for":[99],"each":[100],"sub-goal":[101],"task,":[104],"avoiding":[105],"excessively":[106],"granular":[107],"allocation":[108],"turns.":[110],"To":[111],"emphasize":[112],"trajectory,":[117],"hindsight":[119,148],"model":[120,151],"devised":[122],"reflect":[124],"preference":[126],"performing":[128],"certain":[130],"action":[131,154],"after":[132],"knowing":[133],"trajectory":[135],"outcome.":[136],"With":[137],"characteristic,":[139],"design":[141],"ratios":[143,157],"sequence":[145],"likelihoods":[146],"between":[147],"policy":[150],"measure":[153],"importance.":[155],"The":[156],"are":[158],"subsequently":[159],"employed":[160],"aggregate":[162],"segment":[163],"importance":[164],"scores,":[165],"turn":[168],"segmental":[170],"rewards,":[172],"enhancing":[173],"reliability.":[176],"Extensive":[177],"experimental":[178],"results":[179],"three":[181],"publicly":[182],"benchmarks":[183],"demonstrate":[184],"validity":[186],"our":[188],"method.":[189]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-21T00:00:00"}
