{"id":"https://openalex.org/W7125790422","doi":"https://doi.org/10.48550/arxiv.2601.18467","title":"OffSeeker: Online Reinforcement Learning Is Not All You Need for Deep Research Agents","display_name":"OffSeeker: Online Reinforcement Learning Is Not All You Need for Deep Research Agents","publication_year":2026,"publication_date":"2026-01-26","ids":{"openalex":"https://openalex.org/W7125790422","doi":"https://doi.org/10.48550/arxiv.2601.18467"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.18467","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123924468","display_name":"Yuhang Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhou, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123952853","display_name":"Kai Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123920291","display_name":"Qiguang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qiguang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123998522","display_name":"Mengkang Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Mengkang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123908017","display_name":"Qingfeng Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Qingfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123896758","display_name":"Can Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Can","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123984253","display_name":"Jingjing Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jingjing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5123924468"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.23170000314712524,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.23170000314712524,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.09319999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.07580000162124634,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.8266000151634216},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.7111999988555908},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6467999815940857},{"id":"https://openalex.org/keywords/scarcity","display_name":"Scarcity","score":0.541700005531311},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.4577000141143799},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.45410001277923584},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4212999939918518}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.8266000151634216},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.7111999988555908},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6884999871253967},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6467999815940857},{"id":"https://openalex.org/C109747225","wikidata":"https://www.wikidata.org/wiki/Q815758","display_name":"Scarcity","level":2,"score":0.541700005531311},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.504800021648407},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.4577000141143799},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.45410001277923584},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4212999939918518},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.41600000858306885},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3732999861240387},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.36010000109672546},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.32850000262260437},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.32580000162124634},{"id":"https://openalex.org/C2780102126","wikidata":"https://www.wikidata.org/wiki/Q10928179","display_name":"Online and offline","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2971000075340271},{"id":"https://openalex.org/C2986087404","wikidata":"https://www.wikidata.org/wiki/Q15946010","display_name":"Online learning","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2687000036239624}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.18467","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.18467","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.18467","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.18467","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.449138879776001,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Deep":[0],"research":[1,47,67,97],"agents":[2,142],"have":[3],"shown":[4],"remarkable":[5],"potential":[6],"in":[7],"handling":[8],"long-horizon":[9],"tasks.":[10],"However,":[11],"state-of-the-art":[12],"performance":[13],"typically":[14],"relies":[15],"on":[16],"online":[17,56,153],"reinforcement":[18,57],"learning":[19,58],"(RL),":[20],"which":[21],"is":[22,40,59],"financially":[23],"expensive":[24,55],"due":[25],"to":[26,64],"extensive":[27],"API":[28],"calls.":[29],"While":[30],"offline":[31,82],"training":[32],"offers":[33],"a":[34,75,89,103,124],"more":[35],"efficient":[36],"alternative,":[37],"its":[38],"progress":[39],"hindered":[41],"by":[42],"the":[43],"scarcity":[44],"of":[45,106],"high-quality":[46],"trajectories.":[48],"In":[49],"this":[50,71],"paper,":[51],"we":[52,73,120],"demonstrate":[53],"that":[54,94,135],"not":[60,137],"all":[61],"you":[62],"need":[63],"build":[65],"powerful":[66],"agents.":[68],"To":[69],"bridge":[70],"gap,":[72],"introduce":[74],"fully":[76],"open-source":[77],"suite":[78],"designed":[79],"for":[80],"effective":[81],"training.":[83],"Our":[84],"core":[85],"contributions":[86],"include":[87],"DeepForge,":[88],"ready-to-use":[90],"task":[91],"synthesis":[92],"framework":[93],"generates":[95],"large-scale":[96],"queries":[98],"without":[99],"heavy":[100,152],"preprocessing;":[101],"and":[102,113],"curated":[104],"collection":[105],"66k":[107],"QA":[108],"pairs,":[109],"33k":[110],"SFT":[111],"trajectories,":[112],"21k":[114],"DPO":[115],"pairs.":[116],"Leveraging":[117],"these":[118],"resources,":[119],"train":[121],"OffSeeker":[122,136],"(8B),":[123],"model":[125],"developed":[126],"entirely":[127],"offline.":[128],"Extensive":[129],"evaluations":[130],"across":[131],"six":[132],"benchmarks":[133],"show":[134],"only":[138],"leads":[139],"among":[140],"similar-sized":[141],"but":[143],"also":[144],"remains":[145],"competitive":[146],"with":[147],"30B-parameter":[148],"systems":[149],"trained":[150],"via":[151],"RL.":[154]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-01-28T00:00:00"}
