{"id":"https://openalex.org/W7131424306","doi":"https://doi.org/10.48550/arxiv.2602.19526","title":"How to Train Your Deep Research Agent? Prompt, Reward, and Policy Optimization in Search-R1","display_name":"How to Train Your Deep Research Agent? Prompt, Reward, and Policy Optimization in Search-R1","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131424306","doi":"https://doi.org/10.48550/arxiv.2602.19526"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.19526","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19526","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.19526","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060292278","display_name":"Yinuo Xu","orcid":"https://orcid.org/0009-0008-2414-3424"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xu, Yinuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126853578","display_name":"Shuo Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Shuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122307000","display_name":"Jianjie Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Jianjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126816087","display_name":"Meng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Meng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126839837","display_name":"Qianlong Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Qianlong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126789311","display_name":"Xingxing Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xingxing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126828051","display_name":"Ran He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Ran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126817623","display_name":"Jian Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Jian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5060292278"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.1356000006198883,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.1356000006198883,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12549999356269836,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.11079999804496765,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7498000264167786},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.6947000026702881},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.6247000098228455},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.49160000681877136},{"id":"https://openalex.org/keywords/policy-learning","display_name":"Policy learning","score":0.40630000829696655}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7498000264167786},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.6947000026702881},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.6247000098228455},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6075000166893005},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5776000022888184},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.49160000681877136},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.45419999957084656},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.40630000829696655},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3109999895095825},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26750001311302185},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.2669999897480011},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.25929999351501465}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.19526","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19526","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.19526","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19526","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Deep":[0,166],"Research":[1,167],"agents":[2],"tackle":[3],"knowledge-intensive":[4],"tasks":[5],"through":[6],"multi-round":[7],"retrieval":[8],"and":[9,49,64,143,160],"decision-oriented":[10],"generation.":[11],"While":[12],"reinforcement":[13],"learning":[14],"(RL)":[15],"has":[16],"been":[17],"shown":[18],"to":[19,84,140,145],"improve":[20],"performance":[21,66,135],"in":[22,73,165],"this":[23,91],"paradigm,":[24],"its":[25],"contributions":[26],"remain":[27],"underexplored.":[28],"To":[29],"fully":[30],"understand":[31],"the":[32,57,68,77,81,114,134,155],"role":[33],"of":[34,136],"RL,":[35],"we":[36,125],"conduct":[37],"a":[38,129],"systematic":[39],"study":[40,53],"along":[41],"three":[42],"decoupled":[43],"dimensions:":[44],"prompt":[45],"template,":[46],"reward":[47,79],"function,":[48],"policy":[50,118],"optimization.":[51],"Our":[52],"reveals":[54],"that:":[55],"1)":[56],"Fast":[58],"Thinking":[59,70],"template":[60,71],"yields":[61],"greater":[62],"stability":[63,116],"better":[65],"than":[67],"Slow":[69],"used":[72],"prior":[74],"work;":[75],"2)":[76],"F1-based":[78],"underperforms":[80],"EM":[82],"due":[83],"training":[85,163],"collapse":[86],"driven":[87],"by":[88,95],"answer":[89],"avoidance;":[90],"can":[92,153],"be":[93],"mitigated":[94],"incorporating":[96],"action-level":[97],"penalties,":[98],"ultimately":[99],"surpassing":[100],"EM;":[101],"3)":[102],"REINFORCE":[103],"outperforms":[104],"PPO":[105],"while":[106],"requiring":[107],"fewer":[108],"search":[109],"actions,":[110],"whereas":[111],"GRPO":[112],"shows":[113],"poorest":[115],"among":[117],"optimization":[119],"methods.":[120],"Building":[121],"on":[122],"these":[123],"insights,":[124],"then":[126],"introduce":[127],"Search-R1++,":[128],"strong":[130],"baseline":[131],"that":[132,150],"improves":[133],"Search-R1":[137],"from":[138],"0.403":[139],"0.442":[141],"(Qwen2.5-7B)":[142],"0.289":[144],"0.331":[146],"(Qwen2.5-3B).":[147],"We":[148],"hope":[149],"our":[151],"findings":[152],"pave":[154],"way":[156],"for":[157],"more":[158],"principled":[159],"reliable":[161],"RL":[162],"strategies":[164],"systems.":[168]},"counts_by_year":[],"updated_date":"2026-02-26T06:34:08.959763","created_date":"2026-02-26T00:00:00"}
