{"id":"https://openalex.org/W7125388562","doi":"https://doi.org/10.48550/arxiv.2601.14952","title":"CorpusQA: A 10 Million Token Benchmark for Corpus-Level Analysis and Reasoning","display_name":"CorpusQA: A 10 Million Token Benchmark for Corpus-Level Analysis and Reasoning","publication_year":2026,"publication_date":"2026-01-21","ids":{"openalex":"https://openalex.org/W7125388562","doi":"https://doi.org/10.48550/arxiv.2601.14952"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.14952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.14952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.14952","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123621984","display_name":"Zhiyuan Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lu, Zhiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123605254","display_name":"Chenliang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chenliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123604350","display_name":"Yingcheng Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Yingcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024669409","display_name":"Weizhou Shen","orcid":"https://orcid.org/0000-0001-9180-0043"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Weizhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123560746","display_name":"Ming Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123603651","display_name":"Fei Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Fei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5123621984"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4869999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4869999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.14409999549388885,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.08709999918937683,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6299999952316284},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5608000159263611},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5322999954223633},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.41040000319480896},{"id":"https://openalex.org/keywords/decoupling","display_name":"Decoupling (probability)","score":0.3898000121116638},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.37380000948905945}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7605999708175659},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6299999952316284},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5608000159263611},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5322999954223633},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4408999979496002},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.41040000319480896},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.40400001406669617},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.3898000121116638},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.37380000948905945},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34880000352859497},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3431999981403351},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.30300000309944153},{"id":"https://openalex.org/C2985684807","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Text generation","level":2,"score":0.299699991941452},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.14952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.14952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.14952","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.14952","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"large":[1],"language":[2],"models":[3],"now":[4],"handle":[5],"million-token":[6],"contexts,":[7],"their":[8],"capacity":[9],"for":[10,51,202],"reasoning":[11,99,119,153],"across":[12,60],"entire":[13],"document":[14],"repositories":[15],"remains":[16],"largely":[17],"untested.":[18],"Existing":[19],"benchmarks":[20],"are":[21,25],"inadequate,":[22],"as":[23,164],"they":[24],"mostly":[26],"limited":[27],"to":[28,86,116,198],"single":[29],"long":[30],"texts":[31],"or":[32],"rely":[33],"on":[34,126,143],"a":[35,44,81,92,183,188],"\"sparse":[36],"retrieval\"":[37],"assumption-that":[38],"answers":[39,65],"can":[40],"be":[41],"derived":[42],"from":[43,100,193],"few":[45],"relevant":[46],"chunks.":[47],"This":[48],"assumption":[49],"fails":[50],"true":[52],"corpus-level":[53],"analysis,":[54],"where":[55],"evidence":[56],"is":[57,191],"highly":[58],"dispersed":[59],"hundreds":[61],"of":[62,135],"documents":[63],"and":[64,70,168],"require":[66],"global":[67,203],"integration,":[68],"comparison,":[69],"statistical":[71],"aggregation.":[72],"To":[73],"address":[74],"this":[75,103],"critical":[76,189],"gap,":[77],"we":[78],"introduce":[79],"CorpusQA,":[80],"new":[82],"benchmark":[83],"scaling":[84],"up":[85],"10":[87],"million":[88],"tokens,":[89],"generated":[90],"via":[91],"novel":[93],"data":[94,146],"synthesis":[95],"framework.":[96],"By":[97],"decoupling":[98],"textual":[101],"representation,":[102],"framework":[104,137],"creates":[105],"complex,":[106],"computation-intensive":[107],"queries":[108],"with":[109],"programmatically":[110],"guaranteed":[111],"ground-truth":[112],"answers,":[113],"challenging":[114],"systems":[115,172],"perform":[117],"holistic":[118],"over":[120],"vast,":[121],"unstructured":[122],"text":[123],"without":[124],"relying":[125],"fallible":[127],"human":[128],"annotation.":[129],"We":[130],"further":[131],"demonstrate":[132],"the":[133],"utility":[134],"our":[136,144],"beyond":[138],"evaluation,":[139],"showing":[140],"that":[141,158,178],"fine-tuning":[142],"synthesized":[145],"effectively":[147],"enhances":[148],"an":[149],"LLM's":[150],"general":[151],"long-context":[152,161],"capabilities.":[154],"Extensive":[155],"experiments":[156],"reveal":[157],"even":[159],"state-of-the-art":[160],"LLMs":[162],"struggle":[163],"input":[165],"length":[166],"increases,":[167],"standard":[169],"retrieval-augmented":[170],"generation":[171],"collapse":[173],"entirely.":[174],"Our":[175],"findings":[176],"indicate":[177],"memory-augmented":[179],"agentic":[180],"architectures":[181,201],"offer":[182],"more":[184],"robust":[185],"alternative,":[186],"suggesting":[187],"shift":[190],"needed":[192],"simply":[194],"extending":[195],"context":[196],"windows":[197],"developing":[199],"advanced":[200],"information":[204],"synthesis.":[205]},"counts_by_year":[],"updated_date":"2026-01-23T23:24:52.574035","created_date":"2026-01-23T00:00:00"}
