{"id":"https://openalex.org/W4412378025","doi":"https://doi.org/10.1145/3726302.3730229","title":"Measuring Hypothesis Testing Errors in the Evaluation of Retrieval Systems","display_name":"Measuring Hypothesis Testing Errors in the Evaluation of Retrieval Systems","publication_year":2025,"publication_date":"2025-07-13","ids":{"openalex":"https://openalex.org/W4412378025","doi":"https://doi.org/10.1145/3726302.3730229"},"language":"en","primary_location":{"id":"doi:10.1145/3726302.3730229","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3726302.3730229","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730229","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730229","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034848421","display_name":"Jack McKechnie","orcid":"https://orcid.org/0000-0002-1509-9248"},"institutions":[{"id":"https://openalex.org/I7882870","display_name":"University of Glasgow","ror":"https://ror.org/00vtgdb53","country_code":"GB","type":"education","lineage":["https://openalex.org/I7882870"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Jack McKechnie","raw_affiliation_strings":["University of Glasgow, Glasgow, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Glasgow, Glasgow, United Kingdom","institution_ids":["https://openalex.org/I7882870"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063436649","display_name":"Graham McDonald","orcid":"https://orcid.org/0000-0002-1266-5996"},"institutions":[{"id":"https://openalex.org/I7882870","display_name":"University of Glasgow","ror":"https://ror.org/00vtgdb53","country_code":"GB","type":"education","lineage":["https://openalex.org/I7882870"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Graham McDonald","raw_affiliation_strings":["University of Glasgow, Glasgow, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Glasgow, Glasgow, United Kingdom","institution_ids":["https://openalex.org/I7882870"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057643560","display_name":"Craig Macdonald","orcid":"https://orcid.org/0000-0003-3143-279X"},"institutions":[{"id":"https://openalex.org/I7882870","display_name":"University of Glasgow","ror":"https://ror.org/00vtgdb53","country_code":"GB","type":"education","lineage":["https://openalex.org/I7882870"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Craig Macdonald","raw_affiliation_strings":["University of Glasgow, Glasgow, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Glasgow, Glasgow, United Kingdom","institution_ids":["https://openalex.org/I7882870"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5034848421"],"corresponding_institution_ids":["https://openalex.org/I7882870"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08360776,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2875","last_page":"2879"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6878921389579773},{"id":"https://openalex.org/keywords/system-testing","display_name":"System testing","score":0.4713403880596161},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.45546847581863403},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.41927242279052734},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.17178845405578613},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.11747804284095764}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6878921389579773},{"id":"https://openalex.org/C7166840","wikidata":"https://www.wikidata.org/wiki/Q1199682","display_name":"System testing","level":2,"score":0.4713403880596161},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.45546847581863403},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.41927242279052734},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.17178845405578613},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.11747804284095764}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3726302.3730229","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3726302.3730229","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730229","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},{"id":"pmh:oai:eprints.gla.ac.uk:352839","is_oa":true,"landing_page_url":"https://eprints.gla.ac.uk/view/author/70479.html>","pdf_url":null,"source":{"id":"https://openalex.org/S4210235606","display_name":"ENLIGHTEN (Jurnal Bimbingan dan Konseling Islam)","issn_l":"2622-8912","issn":["2622-8912","2622-8920"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"}],"best_oa_location":{"id":"doi:10.1145/3726302.3730229","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3726302.3730229","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730229","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412378025.pdf","grobid_xml":"https://content.openalex.org/works/W4412378025.grobid-xml"},"referenced_works_count":38,"referenced_works":["https://openalex.org/W1857390120","https://openalex.org/W1965690069","https://openalex.org/W1983595289","https://openalex.org/W1985514943","https://openalex.org/W1989373893","https://openalex.org/W1994960885","https://openalex.org/W2007671364","https://openalex.org/W2023217403","https://openalex.org/W2053154970","https://openalex.org/W2057495142","https://openalex.org/W2075893676","https://openalex.org/W2078488685","https://openalex.org/W2100240666","https://openalex.org/W2101807845","https://openalex.org/W2102661342","https://openalex.org/W2109553965","https://openalex.org/W2135985057","https://openalex.org/W2160471965","https://openalex.org/W2161417756","https://openalex.org/W2336806308","https://openalex.org/W2463093044","https://openalex.org/W2742156913","https://openalex.org/W2794432940","https://openalex.org/W2796677525","https://openalex.org/W2923053417","https://openalex.org/W2953438995","https://openalex.org/W2975013005","https://openalex.org/W2998679789","https://openalex.org/W3088694469","https://openalex.org/W3146187623","https://openalex.org/W4292402161","https://openalex.org/W4384107234","https://openalex.org/W4384648390","https://openalex.org/W4385688511","https://openalex.org/W4386043577","https://openalex.org/W4400526908","https://openalex.org/W4409167404","https://openalex.org/W6602744131"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W133258754"],"abstract_inverted_index":{"The":[0],"evaluation":[1],"of":[2,38,82,90,92,163,194,217],"Information":[3],"Retrieval":[4],"(IR)":[5],"systems":[6,93],"typically":[7],"uses":[8],"query-document":[9],"pairs":[10,91],"with":[11],"corresponding":[12],"human-labelled":[13],"relevance":[14,40,47,173],"assessments":[15,41],"(qrels).":[16],"These":[17],"qrels":[18,56,169,195],"are":[19,95],"used":[20,157,211],"to":[21,57,66,111,115,158,176,212],"determine":[22],"if":[23],"one":[24],"system":[25],"is":[26,42,73,130],"better":[27],"than":[28],"another":[29],"based":[30],"on":[31,79],"average":[32],"retrieval":[33],"performance.":[34],"Acquiring":[35],"large":[36],"volumes":[37],"human":[39],"expensive.":[43],"Therefore,":[44],"more":[45],"efficient":[46],"assessment":[48,174],"approaches":[49],"have":[50],"been":[51],"proposed,":[52],"necessitating":[53],"comparisons":[54],"between":[55,71],"ascertain":[58],"their":[59],"efficacy.":[60],"Discriminative":[61],"power,":[62],"i.e.":[63],"the":[64,80,88,137,160,191],"ability":[65],"correctly":[67],"identify":[68],"significant":[69],"differences":[70],"systems,":[72],"important":[74,131],"for":[75],"drawing":[76],"accurate":[77],"conclusions":[78,113],"robustness":[81],"qrels.":[83,164],"Previous":[84],"work":[85],"has":[86,101],"measured":[87],"proportion":[89],"that":[94,122,147,187,205],"identified":[96],"as":[97,132,152],"significantly":[98],"different":[99],"and":[100,145,204],"quantified":[102],"Type":[103,107,125,142,201],"I":[104,108],"statistical":[105],"errors.":[106],"errors":[109,127,144,181],"lead":[110,134],"incorrect":[112],"due":[114],"false":[116],"positive":[117],"significance":[118],"tests.":[119],"We":[120,140,165,185],"argue":[121],"also":[123],"identifying":[124],"II":[126,143,202],"(false":[128],"negatives)":[129],"they":[133],"science":[135],"in":[136,182,220],"wrong":[138],"direction.":[139],"quantify":[141],"propose":[146],"balanced":[148,153,206],"classification":[149,207],"metrics,":[150],"such":[151],"accuracy,":[154],"can":[155,196,209],"be":[156,197,210],"portray":[159],"discriminative":[161,192,218],"power":[162,193,219],"perform":[166],"experiments":[167],"using":[168,171],"generated":[170],"alternative":[172],"methods":[175],"investigate":[177],"measuring":[178],"hypothesis":[179],"testing":[180],"IR":[183],"evaluation.":[184],"find":[186],"additional":[188],"insights":[189],"into":[190],"gained":[198],"by":[199],"quantifying":[200],"errors,":[203],"metrics":[208],"give":[213],"an":[214],"overall":[215],"summary":[216],"one,":[221],"easily":[222],"comparable,":[223],"number.":[224]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
