{"id":"https://openalex.org/W7161996508","doi":"https://doi.org/10.48550/arxiv.2605.21482","title":"DeepWeb-Bench: A Deep Research Benchmark Demanding Massive Cross-Source Evidence and Long-Horizon Derivation","display_name":"DeepWeb-Bench: A Deep Research Benchmark Demanding Massive Cross-Source Evidence and Long-Horizon Derivation","publication_year":2026,"publication_date":"2026-05-20","ids":{"openalex":"https://openalex.org/W7161996508","doi":"https://doi.org/10.48550/arxiv.2605.21482"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.21482","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21482","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.21482","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079578592","display_name":"Sixiong Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Sixiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136709989","display_name":"Zhuofan Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Zhuofan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136717817","display_name":"Haiyang Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Haiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136712999","display_name":"Jiuzheng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jiuzheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136722877","display_name":"Siqi Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Siqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136629728","display_name":"Mugeng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Mugeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044039411","display_name":"C PAN","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Chongyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123171206","display_name":"Peilun Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Peilun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136710788","display_name":"Baoqing Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Baoqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136141839","display_name":"Xiang Jing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jing, Xiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136614018","display_name":"Yun Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Yun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.15049999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.15049999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.133200004696846,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.11729999631643295,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8590999841690063},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6676999926567078},{"id":"https://openalex.org/keywords/calibration","display_name":"Calibration","score":0.47850000858306885},{"id":"https://openalex.org/keywords/frontier","display_name":"Frontier","score":0.46140000224113464},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.44530001282691956},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.3871999979019165}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8590999841690063},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6700999736785889},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6676999926567078},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5022000074386597},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.47850000858306885},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.46140000224113464},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.44530001282691956},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.426800012588501},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.3871999979019165},{"id":"https://openalex.org/C2911011789","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Hallucinating","level":2,"score":0.36559998989105225},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3370000123977661},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.30480000376701355},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C197947376","wikidata":"https://www.wikidata.org/wiki/Q5155608","display_name":"Comparability","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.2833000123500824},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27160000801086426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.21482","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21482","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.21482","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21482","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Deep":[0],"research,":[1],"in":[2,178],"which":[3],"an":[4,14],"agent":[5],"searches":[6],"the":[7,64,73,134,152,224],"open":[8],"web,":[9],"collects":[10],"evidence,":[11],"and":[12,84,102,104,123,144,165,174,190,196,212,227],"derives":[13],"answer":[15,112],"through":[16],"extended":[17],"reasoning,":[18],"is":[19,57,113,150],"a":[20,52,116],"prominent":[21],"use":[22],"case":[23],"for":[24,63,158,169],"frontier":[25,142],"language":[26],"models.":[27],"Frontier":[28],"deep":[29,53],"research":[30,54],"products":[31],"score":[32],"high":[33],"on":[34,140],"existing":[35,61],"benchmarks,":[36],"making":[37,128],"it":[38],"difficult":[39],"to":[40,131],"distinguish":[41],"their":[42],"capabilities":[43],"from":[44,69],"current":[45,65],"evaluation":[46,228],"data":[47,74],"alone.":[48],"We":[49,88,137],"introduce":[50],"DeepWeb-Bench,":[51],"benchmark":[55,221],"that":[56],"substantially":[58],"harder":[59],"than":[60],"benchmarks":[62],"frontier.":[66],"Difficulty":[67],"comes":[68],"three":[70,91,146],"properties":[71],"of":[72,93,161,207],"itself:":[75],"each":[76],"task":[77],"requires":[78],"massive":[79],"evidence":[80],"collection,":[81],"cross-source":[82,124],"reconciliation,":[83],"long-horizon":[85],"multi-step":[86],"derivation.":[87],"represent":[89],"these":[90],"sources":[92],"difficulty":[94],"as":[95,154],"four":[96,120],"capability":[97],"families":[98],"(Retrieval,":[99],"Derivation,":[100],"Reasoning,":[101],"Calibration)":[103],"report":[105,145],"results":[106],"sliced":[107],"by":[108,115,187,193],"family.":[109],"Every":[110],"reference":[111],"accompanied":[114],"source-provenance":[117],"record":[118],"with":[119,182,204],"disclosure":[121],"levels":[122],"checks":[125],"where":[126],"available,":[127],"scores":[129],"easier":[130],"audit":[132],"against":[133],"underlying":[135],"evidence.":[136],"evaluate":[138],"DeepWeb-Bench":[139],"nine":[141],"models":[143,176,198],"findings:":[147],"(1)":[148],"retrieval":[149,155],"not":[151],"bottleneck,":[153],"failures":[156,167],"account":[157,168],"only":[159,208],"12-14%":[160],"errors":[162,185],"while":[163],"derivation":[164,189],"calibration":[166],"over":[170],"70%;":[171],"(2)":[172],"strong":[173,183],"weak":[175,191],"fail":[177],"qualitatively":[179],"different":[180],"ways,":[181],"models'":[184,192],"dominated":[186],"incomplete":[188],"hallucinated":[194],"precision;":[195],"(3)":[197],"exhibit":[199],"genuine":[200],"specialization":[201],"across":[202],"domains,":[203],"cross-model":[205],"agreement":[206],"rho":[209],"=":[210],"0.61":[211],"per-case":[213],"disagreement":[214],"reaching":[215],"18.8":[216],"percentage":[217],"points.":[218],"The":[219],"public":[220],"release":[222],"includes":[223],"data,":[225],"rubrics,":[226],"code.":[229]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-22T00:00:00"}
