{"id":"https://openalex.org/W7148504962","doi":"https://doi.org/10.48550/arxiv.2604.00024","title":"WHBench: Evaluating Frontier LLMs with Expert-in-the-Loop Validation on Women's Health Topics","display_name":"WHBench: Evaluating Frontier LLMs with Expert-in-the-Loop Validation on Women's Health Topics","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7148504962","doi":"https://doi.org/10.48550/arxiv.2604.00024"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00024","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00024","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00024","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132783862","display_name":"Sneha Maurya","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Maurya, Sneha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132790077","display_name":"Pragya Saboo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saboo, Pragya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132798473","display_name":"Girish Kumar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kumar, Girish","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5132783862"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.362199991941452,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.362199991941452,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.16339999437332153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11249999701976776,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6636000275611877},{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.6635000109672546},{"id":"https://openalex.org/keywords/guideline","display_name":"Guideline","score":0.5576000213623047},{"id":"https://openalex.org/keywords/harm","display_name":"Harm","score":0.5476999878883362},{"id":"https://openalex.org/keywords/safer","display_name":"SAFER","score":0.5462999939918518},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5213000178337097},{"id":"https://openalex.org/keywords/toolbox","display_name":"Toolbox","score":0.39910000562667847},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.39419999718666077},{"id":"https://openalex.org/keywords/risk-assessment","display_name":"Risk assessment","score":0.3880999982357025}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6636000275611877},{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.6635000109672546},{"id":"https://openalex.org/C2780182762","wikidata":"https://www.wikidata.org/wiki/Q1630279","display_name":"Guideline","level":2,"score":0.5576000213623047},{"id":"https://openalex.org/C2777363581","wikidata":"https://www.wikidata.org/wiki/Q15098235","display_name":"Harm","level":2,"score":0.5476999878883362},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.5462999939918518},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5213000178337097},{"id":"https://openalex.org/C162118730","wikidata":"https://www.wikidata.org/wiki/Q1128453","display_name":"Actuarial science","level":1,"score":0.48249998688697815},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.46619999408721924},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4512999951839447},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.40059998631477356},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.39910000562667847},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.39419999718666077},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.3880999982357025},{"id":"https://openalex.org/C138816342","wikidata":"https://www.wikidata.org/wiki/Q189603","display_name":"Public health","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.33340001106262207},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.319599986076355},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.3133000135421753},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.3034999966621399},{"id":"https://openalex.org/C535046627","wikidata":"https://www.wikidata.org/wiki/Q30612","display_name":"Clinical trial","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.30140000581741333},{"id":"https://openalex.org/C47344431","wikidata":"https://www.wikidata.org/wiki/Q1519812","display_name":"Health policy","level":3,"score":0.3001999855041504},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.2992999851703644},{"id":"https://openalex.org/C157085824","wikidata":"https://www.wikidata.org/wiki/Q2384809","display_name":"TRIPS architecture","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C2779473830","wikidata":"https://www.wikidata.org/wiki/Q1540899","display_name":"MEDLINE","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C168725872","wikidata":"https://www.wikidata.org/wiki/Q991663","display_name":"Sophistication","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C187155963","wikidata":"https://www.wikidata.org/wiki/Q629029","display_name":"Occupational safety and health","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.263700008392334},{"id":"https://openalex.org/C2778149918","wikidata":"https://www.wikidata.org/wiki/Q3291156","display_name":"Population health","level":3,"score":0.26179999113082886},{"id":"https://openalex.org/C2780877353","wikidata":"https://www.wikidata.org/wiki/Q2518253","display_name":"Health services research","level":3,"score":0.26100000739097595},{"id":"https://openalex.org/C2780456720","wikidata":"https://www.wikidata.org/wiki/Q1135011","display_name":"Health impact assessment","level":3,"score":0.259799987077713},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C99454951","wikidata":"https://www.wikidata.org/wiki/Q932068","display_name":"Environmental health","level":1,"score":0.25110000371932983},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00024","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00024","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00024","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00024","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Gender equality","score":0.451599657535553,"id":"https://metadata.un.org/sdg/5"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,58,106],"are":[3],"increasingly":[4],"used":[5],"for":[6,129,135,143],"medical":[7],"guidance,":[8],"but":[9,127],"women's":[10,34],"health":[11,35,164],"remains":[12],"under-evaluated":[13],"in":[14,115,146,162],"benchmark":[15,154],"design.":[16],"We":[17,55],"present":[18],"the":[19,98,123,141],"Women's":[20],"Health":[21],"Benchmark":[22],"(WHBench),":[23],"a":[24,60,151],"targeted":[25],"evaluation":[26,138],"suite":[27],"of":[28],"47":[29],"expert-crafted":[30],"scenarios":[31],"across":[32],"10":[33],"topics,":[36],"designed":[37],"to":[38,155],"expose":[39],"clinically":[40],"meaningful":[41],"failure":[42],"modes":[43],"including":[44],"outdated":[45],"guidelines,":[46],"unsafe":[47],"omissions,":[48],"dosing":[49],"errors,":[50],"and":[51,75,81,112,158],"equity-related":[52],"blind":[53],"spots.":[54],"evaluate":[56],"22":[57],"using":[59],"23-criterion":[61],"rubric":[62],"spanning":[63],"clinical":[64,147],"accuracy,":[65],"completeness,":[66],"safety,":[67],"communication":[68],"quality,":[69],"instruction":[70],"following,":[71],"equity,":[72],"uncertainty":[73],"handling,":[74],"guideline":[76],"adherence,":[77],"with":[78],"safety-weighted":[79],"penalties":[80],"server-side":[82],"score":[83],"recalculation.":[84],"Across":[85],"3,102":[86],"attempted":[87],"responses":[88],"(3,100":[89],"scored),":[90],"no":[91],"model":[92,100,130],"mean":[93],"performance":[94],"exceeds":[95],"75":[96],"percent;":[97],"best":[99],"reaches":[101],"72.1":[102],"percent.":[103],"Even":[104],"top":[105],"show":[107],"low":[108],"fully":[109],"correct":[110],"rates":[111],"substantial":[113],"variation":[114],"harm":[116],"rates.":[117],"Inter-rater":[118],"reliability":[119],"is":[120],"moderate":[121],"at":[122],"response":[124],"label":[125],"level":[126],"high":[128],"ranking,":[131],"supporting":[132],"WHBench":[133,149],"utility":[134],"comparative":[136],"system":[137],"while":[139],"highlighting":[140],"need":[142],"expert":[144],"oversight":[145],"deployment.":[148],"provides":[150],"public,":[152],"failure-mode-aware":[153],"track":[156],"safer":[157],"more":[159],"equitable":[160],"progress":[161],"womens":[163],"AI.":[165]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
