{"id":"https://openalex.org/W7127606770","doi":"https://doi.org/10.48550/arxiv.2602.03645","title":"Reinforcement Fine-Tuning for History-Aware Dense Retriever in RAG","display_name":"Reinforcement Fine-Tuning for History-Aware Dense Retriever in RAG","publication_year":2026,"publication_date":"2026-02-03","ids":{"openalex":"https://openalex.org/W7127606770","doi":"https://doi.org/10.48550/arxiv.2602.03645"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.03645","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124971679","display_name":"Yicheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Yicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125012566","display_name":"Zhen Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Zhen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122573241","display_name":"Zhaomin Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zhaomin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124995094","display_name":"Wenqi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wenqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124976566","display_name":"Shuiguang Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Shuiguang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5124971679"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6157000064849854,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6157000064849854,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.17790000140666962,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.08070000261068344,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/labrador-retriever","display_name":"Labrador Retriever","score":0.7317000031471252},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.49000000953674316},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.4318000078201294},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4187999963760376},{"id":"https://openalex.org/keywords/aliasing","display_name":"Aliasing","score":0.40869998931884766},{"id":"https://openalex.org/keywords/optimization-problem","display_name":"Optimization problem","score":0.3578999936580658}],"concepts":[{"id":"https://openalex.org/C2910597052","wikidata":"https://www.wikidata.org/wiki/Q38726","display_name":"Labrador Retriever","level":2,"score":0.7317000031471252},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5792999863624573},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.49000000953674316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4805999994277954},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.4318000078201294},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4187999963760376},{"id":"https://openalex.org/C4069607","wikidata":"https://www.wikidata.org/wiki/Q868732","display_name":"Aliasing","level":3,"score":0.40869998931884766},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3578999936580658},{"id":"https://openalex.org/C98763669","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov chain","level":2,"score":0.3384000062942505},{"id":"https://openalex.org/C106189395","wikidata":"https://www.wikidata.org/wiki/Q176789","display_name":"Markov decision process","level":3,"score":0.3192000091075897},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29840001463890076},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C159886148","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov process","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.28619998693466187},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.26910001039505005},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26510000228881836},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.25839999318122864}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.03645","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.03645","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.03645","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.03645","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.74847811460495}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Retrieval-augmented":[0],"generation":[1],"(RAG)":[2],"enables":[3],"large":[4],"language":[5],"models":[6],"(LLMs)":[7],"to":[8,32,58,65,128],"produce":[9],"evidence-based":[10],"responses,":[11],"and":[12,22,45,81,103,139],"its":[13],"performance":[14],"hinges":[15],"on":[16],"the":[17,20,46,73,122],"matching":[18],"between":[19,42],"retriever":[21,43,66,112,140],"LLMs.":[23,34],"Retriever":[24],"optimization":[25,44,67],"has":[26],"emerged":[27],"as":[28,106],"an":[29],"efficient":[30],"alternative":[31],"fine-tuning":[33],"However,":[35],"existing":[36],"solutions":[37],"suffer":[38],"from":[39,86],"objective":[40],"mismatch":[41],"goal":[47],"of":[48,145],"RAG":[49,105,136,149],"pipeline.":[50],"Reinforcement":[51],"learning":[52],"(RL)":[53],"provides":[54],"a":[55,107],"promising":[56],"solution":[57],"address":[59,93],"this":[60],"limitation,":[61],"yet":[62],"applying":[63],"RL":[64,79],"introduces":[68],"two":[69],"fundamental":[70],"challenges:":[71],"1)":[72],"deterministic":[74,98],"retrieval":[75,88,99,119,126],"is":[76],"incompatible":[77],"with":[78,100],"formulations,":[80],"2)":[82],"state":[83,123,130],"aliasing":[84],"arises":[85],"query-only":[87],"in":[89,148],"multi-hop":[90],"reasoning.":[91],"To":[92],"these":[94],"challenges,":[95],"we":[96,117],"replace":[97],"stochastic":[101],"sampling":[102],"formulate":[104],"Markov":[108],"decision":[109],"process,":[110],"making":[111],"optimizable":[113],"by":[114],"RL.":[115],"Further,":[116],"incorporate":[118],"history":[120],"into":[121],"at":[124],"each":[125],"step":[127],"mitigate":[129],"aliasing.":[131],"Extensive":[132],"experiments":[133],"across":[134],"diverse":[135],"pipelines,":[137],"datasets,":[138],"scales":[141],"demonstrate":[142],"consistent":[143],"improvements":[144],"our":[146],"approach":[147],"performance.":[150]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-06T00:00:00"}
