{"id":"https://openalex.org/W7155407227","doi":"https://doi.org/10.48550/arxiv.2604.20763","title":"Coverage, Not Averages: Semantic Stratification for Trustworthy Retrieval Evaluation","display_name":"Coverage, Not Averages: Semantic Stratification for Trustworthy Retrieval Evaluation","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155407227","doi":"https://doi.org/10.48550/arxiv.2604.20763"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20763","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20763","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20763","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134367040","display_name":"Andrew Klearman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Klearman, Andrew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075140860","display_name":"Radu Revutchi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Revutchi, Radu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030111200","display_name":"Rohin Garg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garg, Rohin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030014440","display_name":"Rishav Chakravarti","orcid":"https://orcid.org/0000-0002-1612-8231"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chakravarti, Rishav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074406404","display_name":"Samuel Marc Denton","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Denton, Samuel Marc","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5028544932","display_name":"Yuan Xue","orcid":"https://orcid.org/0000-0002-5390-9037"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Yuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9735999703407288,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9735999703407288,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.005400000140070915,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.002400000113993883,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.526199996471405},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.478300005197525},{"id":"https://openalex.org/keywords/document-retrieval","display_name":"Document retrieval","score":0.3828999996185303},{"id":"https://openalex.org/keywords/trustworthiness","display_name":"Trustworthiness","score":0.3804999887943268},{"id":"https://openalex.org/keywords/vector-space-model","display_name":"Vector space model","score":0.37869998812675476},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.3684000074863434},{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.35370001196861267},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.34049999713897705},{"id":"https://openalex.org/keywords/missing-data","display_name":"Missing data","score":0.33379998803138733}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7764000296592712},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5652999877929688},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.526199996471405},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.47850000858306885},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.478300005197525},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.3828999996185303},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.3804999887943268},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3797999918460846},{"id":"https://openalex.org/C89686163","wikidata":"https://www.wikidata.org/wiki/Q1187982","display_name":"Vector space model","level":2,"score":0.37869998812675476},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.3684000074863434},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.35370001196861267},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C9357733","wikidata":"https://www.wikidata.org/wiki/Q6878417","display_name":"Missing data","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.3301999866962433},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.3294000029563904},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3280999958515167},{"id":"https://openalex.org/C2778180026","wikidata":"https://www.wikidata.org/wiki/Q18378163","display_name":"Semantic heterogeneity","level":4,"score":0.299699991941452},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28450000286102295},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C551230270","wikidata":"https://www.wikidata.org/wiki/Q4368942","display_name":"Data retrieval","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C22639730","wikidata":"https://www.wikidata.org/wiki/Q7702546","display_name":"Term Discrimination","level":5,"score":0.273499995470047},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C149189445","wikidata":"https://www.wikidata.org/wiki/Q5283894","display_name":"Divergence-from-randomness model","level":3,"score":0.2644999921321869},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.2605000138282776},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C123403432","wikidata":"https://www.wikidata.org/wiki/Q654068","display_name":"Visibility","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C2986420190","wikidata":"https://www.wikidata.org/wiki/Q39045939","display_name":"Semantic space","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20763","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20763","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20763","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20763","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8011918067932129,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Retrieval":[0],"quality":[1],"is":[2,41],"the":[3,45],"primary":[4],"bottleneck":[5],"for":[6,74],"accuracy":[7],"and":[8,70,87,99,120,128],"robustness":[9],"in":[10,56,117],"retrieval-augmented":[11],"generation":[12],"(RAG).":[13],"Current":[14],"evaluation":[15,31,55,124],"relies":[16],"on":[17],"heuristically":[18],"constructed":[19],"query":[20],"sets,":[21],"which":[22,53],"introduce":[23,50],"a":[24,33],"hidden":[25],"intrinsic":[26],"bias.":[27],"We":[28,48],"formalize":[29],"retrieval":[30,85,92,100,118],"as":[32],"statistical":[34],"estimation":[35],"problem,":[36],"showing":[37],"that":[38,114,122],"metric":[39],"reliability":[40],"fundamentally":[42],"limited":[43],"by":[44,59],"evaluation-set":[46],"construction.":[47],"further":[49],"\\emph{semantic":[51],"stratification},":[52],"grounds":[54],"corpus":[57],"structure":[58],"organizing":[60],"documents":[61],"into":[62,91],"an":[63],"interpretable":[64,89],"global":[65],"space":[66],"of":[67],"entity-based":[68],"clusters":[69],"systematically":[71],"generating":[72],"queries":[73],"missing":[75],"strata.":[76],"This":[77],"yields":[78,125],"(1)":[79],"formal":[80],"semantic":[81],"coverage":[82,109],"guarantees":[83],"across":[84,96],"regimes":[86],"(2)":[88],"visibility":[90],"failure":[93],"modes.":[94],"Experiments":[95],"multiple":[97],"benchmarks":[98],"methods":[101],"validate":[102],"our":[103],"framework.":[104],"The":[105],"results":[106],"expose":[107],"systematic":[108],"gaps,":[110],"identify":[111],"structural":[112],"signals":[113],"explain":[115],"variance":[116],"performance,":[119],"show":[121],"stratified":[123],"more":[126,133],"stable":[127],"transparent":[129],"assessments":[130],"while":[131],"supporting":[132],"trustworthy":[134],"decision-making":[135],"than":[136],"aggregate":[137],"metrics.":[138]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-24T00:00:00"}
