{"id":"https://openalex.org/W4416037066","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.143","title":"MedHallu: A Comprehensive Benchmark for Detecting Medical Hallucinations in Large Language Models","display_name":"MedHallu: A Comprehensive Benchmark for Detecting Medical Hallucinations in Large Language Models","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416037066","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.143"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.143","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.143","pdf_url":"https://aclanthology.org/2025.emnlp-main.143.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.143.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033179563","display_name":"Shrey Pandit","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shrey Pandit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009642125","display_name":"Jiawei Xu","orcid":"https://orcid.org/0000-0002-4212-3872"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiawei Xu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071064449","display_name":"Junyuan Hong","orcid":"https://orcid.org/0000-0002-5718-5187"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junyuan Hong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048522863","display_name":"Zhangyang Wang","orcid":"https://orcid.org/0000-0002-2050-5693"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhangyang Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103073431","display_name":"Tianlong Chen","orcid":"https://orcid.org/0000-0001-7774-8197"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianlong Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102775611","display_name":"Kaidi Xu","orcid":"https://orcid.org/0000-0003-4437-0671"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaidi Xu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5047170063","display_name":"Ying Ding","orcid":"https://orcid.org/0000-0003-2567-2009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ying Ding","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5033179563"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":15.9773,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.99119482,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2858","last_page":"2873"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12488","display_name":"Mental Health via Writing","score":0.21469999849796295,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12488","display_name":"Mental Health via Writing","score":0.21469999849796295,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.08860000222921371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12553","display_name":"Psychedelics and Drug Studies","score":0.057999998331069946,"subfield":{"id":"https://openalex.org/subfields/3203","display_name":"Clinical Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5357000231742859},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3792000114917755},{"id":"https://openalex.org/keywords/language-understanding","display_name":"Language understanding","score":0.2825999855995178},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.26930001378059387}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6039999723434448},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5444999933242798},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5357000231742859},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.47450000047683716},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3792000114917755},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.35030001401901245},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34610000252723694},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.24629999697208405}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.143","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.143","pdf_url":"https://aclanthology.org/2025.emnlp-main.143.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.143","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.143","pdf_url":"https://aclanthology.org/2025.emnlp-main.143.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416037066.pdf","grobid_xml":"https://content.openalex.org/works/W4416037066.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Advancements":[0],"in":[1,10,22],"Large":[2],"Language":[3],"Models":[4],"(LLMs)":[5],"and":[6,42,86,136,151],"their":[7,17],"increasing":[8],"use":[9],"medical":[11,33,58],"question-answering":[12],"necessitate":[13],"rigorous":[14],"evaluation":[15],"of":[16,51,144],"reliability.A":[18],"critical":[19],"challenge":[20],"lies":[21],"hallucination,":[23],"where":[24],"models":[25],"generate":[26],"plausible":[27],"yet":[28],"factually":[29],"incorrect":[30],"outputs.In":[31],"the":[32,52,87,99,145,149],"domain,":[34],"this":[35,93],"poses":[36],"serious":[37],"risks":[38],"to":[39,126,156,159],"patient":[40],"safety":[41],"clinical":[43],"decision-making.To":[44],"address":[45],"this,":[46],"we":[47,118,130],"introduce":[48],"MedHallu,":[49],"one":[50,143],"first":[53],"benchmark":[54],"specifically":[55],"designed":[56],"for":[57,110],"hallucination":[59,95],"detection.Med-Hallu":[60],"comprises":[61],"10,000":[62],"high-quality":[63],"questionanswer":[64],"pairs":[65],"derived":[66],"from":[67],"PubMedQA,":[68],"with":[69,92,98],"hallucinated":[70],"answers":[71],"systematically":[72],"generated":[73],"through":[74],"a":[75,138],"controlled":[76],"pipeline.Our":[77],"experiments":[78],"show":[79,119,132],"that":[80,120],"state-of-the-art":[81],"LLMs,":[82],"including":[83],"GPT-4o,":[84],"Llama-3.1,":[85],"medically":[88],"fine-tuned":[89],"UltraMedical,":[90],"struggle":[91],"binary":[94],"detection":[96],"task,":[97],"best":[100],"model":[101],"achieving":[102],"an":[103],"F1":[104,152],"score":[105],"as":[106,108,142],"low":[107],"0.625":[109],"detecting":[111],"\"hard\"":[112],"category":[113,141],"hallucinations.Using":[114],"bidirectional":[115],"entailment":[116],"clustering,":[117],"harder-to-detect":[121],"hallucinations":[122],"are":[123],"semantically":[124],"closer":[125],"ground":[127],"truth.Through":[128],"experiments,":[129],"also":[131],"incorporating":[133],"domainspecific":[134],"knowledge":[135],"introducing":[137],"\"not":[139],"sure\"":[140],"answer":[146],"categories":[147],"improves":[148],"precision":[150],"scores":[153],"by":[154],"up":[155],"38%":[157],"relative":[158],"baselines.":[160]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5}],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-08T00:00:00"}
