{"id":"https://openalex.org/W4416036814","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.548","title":"From Scores to Steps: Diagnosing and Improving LLM Performance in Evidence-Based Medical Calculations","display_name":"From Scores to Steps: Diagnosing and Improving LLM Performance in Evidence-Based Medical Calculations","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036814","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.548","pmid":"https://pubmed.ncbi.nlm.nih.gov/41799784"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.548","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.548","pdf_url":"https://aclanthology.org/2025.emnlp-main.548.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.548.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022400870","display_name":"Benlu Wang","orcid":"https://orcid.org/0009-0009-6812-0256"},"institutions":[{"id":"https://openalex.org/I32971472","display_name":"Yale University","ror":"https://ror.org/03v76x132","country_code":"US","type":"education","lineage":["https://openalex.org/I32971472"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Benlu Wang","raw_affiliation_strings":["Department of Computer Science, Yale University, CT, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Yale University, CT, USA","institution_ids":["https://openalex.org/I32971472"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120025196","display_name":"Iris Xia","orcid":null},"institutions":[{"id":"https://openalex.org/I32971472","display_name":"Yale University","ror":"https://ror.org/03v76x132","country_code":"US","type":"education","lineage":["https://openalex.org/I32971472"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Iris Xia","raw_affiliation_strings":["Department of Computer Science, Yale University, CT, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Yale University, CT, USA","institution_ids":["https://openalex.org/I32971472"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100376961","display_name":"Yifan Zhang","orcid":"https://orcid.org/0000-0003-1298-5436"},"institutions":[{"id":"https://openalex.org/I133738476","display_name":"University of Massachusetts Lowell","ror":"https://ror.org/03hamhx47","country_code":"US","type":"education","lineage":["https://openalex.org/I133738476"]},{"id":"https://openalex.org/I4210090056","display_name":"Bedford VA Research Corporation","ror":"https://ror.org/00caks861","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210090056"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifan Zhang","raw_affiliation_strings":["Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","Miner School of Computer and Information Sciences, UMass Lowell, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","institution_ids":["https://openalex.org/I4210090056"]},{"raw_affiliation_string":"Miner School of Computer and Information Sciences, UMass Lowell, MA, USA","institution_ids":["https://openalex.org/I133738476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068755602","display_name":"Junda Wang","orcid":"https://orcid.org/0009-0005-9998-7275"},"institutions":[{"id":"https://openalex.org/I177605424","display_name":"Amherst College","ror":"https://ror.org/028vqfs63","country_code":"US","type":"education","lineage":["https://openalex.org/I177605424"]},{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]},{"id":"https://openalex.org/I4210090056","display_name":"Bedford VA Research Corporation","ror":"https://ror.org/00caks861","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210090056"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Junda Wang","raw_affiliation_strings":["Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","Manning College of Information and Computer Sciences, UMass Amherst, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","institution_ids":["https://openalex.org/I4210090056"]},{"raw_affiliation_string":"Manning College of Information and Computer Sciences, UMass Amherst, MA, USA","institution_ids":["https://openalex.org/I177605424","https://openalex.org/I24603500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093996581","display_name":"Feiyun Ouyang","orcid":"https://orcid.org/0000-0002-7061-7351"},"institutions":[{"id":"https://openalex.org/I133738476","display_name":"University of Massachusetts Lowell","ror":"https://ror.org/03hamhx47","country_code":"US","type":"education","lineage":["https://openalex.org/I133738476"]},{"id":"https://openalex.org/I4210090056","display_name":"Bedford VA Research Corporation","ror":"https://ror.org/00caks861","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210090056"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Feiyun Ouyang","raw_affiliation_strings":["Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","Miner School of Computer and Information Sciences, UMass Lowell, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","institution_ids":["https://openalex.org/I4210090056"]},{"raw_affiliation_string":"Miner School of Computer and Information Sciences, UMass Lowell, MA, USA","institution_ids":["https://openalex.org/I133738476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100649312","display_name":"S. Han","orcid":"https://orcid.org/0000-0001-8383-7348"},"institutions":[{"id":"https://openalex.org/I133738476","display_name":"University of Massachusetts Lowell","ror":"https://ror.org/03hamhx47","country_code":"US","type":"education","lineage":["https://openalex.org/I133738476"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shuo Han","raw_affiliation_strings":["Miner School of Computer and Information Sciences, UMass Lowell, MA, USA"],"affiliations":[{"raw_affiliation_string":"Miner School of Computer and Information Sciences, UMass Lowell, MA, USA","institution_ids":["https://openalex.org/I133738476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064858748","display_name":"Arman Cohan","orcid":"https://orcid.org/0000-0002-8954-2724"},"institutions":[{"id":"https://openalex.org/I32971472","display_name":"Yale University","ror":"https://ror.org/03v76x132","country_code":"US","type":"education","lineage":["https://openalex.org/I32971472"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arman Cohan","raw_affiliation_strings":["Department of Computer Science, Yale University, CT, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Yale University, CT, USA","institution_ids":["https://openalex.org/I32971472"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101507232","display_name":"Hongfang Yu","orcid":"https://orcid.org/0000-0002-5219-1780"},"institutions":[{"id":"https://openalex.org/I133738476","display_name":"University of Massachusetts Lowell","ror":"https://ror.org/03hamhx47","country_code":"US","type":"education","lineage":["https://openalex.org/I133738476"]},{"id":"https://openalex.org/I177605424","display_name":"Amherst College","ror":"https://ror.org/028vqfs63","country_code":"US","type":"education","lineage":["https://openalex.org/I177605424"]},{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]},{"id":"https://openalex.org/I4210090056","display_name":"Bedford VA Research Corporation","ror":"https://ror.org/00caks861","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210090056"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hong Yu","raw_affiliation_strings":["Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","Manning College of Information and Computer Sciences, UMass Amherst, MA, USA","Miner School of Computer and Information Sciences, UMass Lowell, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","institution_ids":["https://openalex.org/I4210090056"]},{"raw_affiliation_string":"Manning College of Information and Computer Sciences, UMass Amherst, MA, USA","institution_ids":["https://openalex.org/I177605424","https://openalex.org/I24603500"]},{"raw_affiliation_string":"Miner School of Computer and Information Sciences, UMass Lowell, MA, USA","institution_ids":["https://openalex.org/I133738476"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012790041","display_name":"Zonghai Yao","orcid":null},"institutions":[{"id":"https://openalex.org/I177605424","display_name":"Amherst College","ror":"https://ror.org/028vqfs63","country_code":"US","type":"education","lineage":["https://openalex.org/I177605424"]},{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]},{"id":"https://openalex.org/I4210090056","display_name":"Bedford VA Research Corporation","ror":"https://ror.org/00caks861","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210090056"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zonghai Yao","raw_affiliation_strings":["Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","Manning College of Information and Computer Sciences, UMass Amherst, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Healthcare Organization and Implementation Research, VA Bedford Health Care","institution_ids":["https://openalex.org/I4210090056"]},{"raw_affiliation_string":"Manning College of Information and Computer Sciences, UMass Amherst, MA, USA","institution_ids":["https://openalex.org/I177605424","https://openalex.org/I24603500"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5022400870"],"corresponding_institution_ids":["https://openalex.org/I32971472"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38630671,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"2025","issue":null,"first_page":"10820","last_page":"10844"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12574","display_name":"Clinical Reasoning and Diagnostic Skills","score":0.049800001084804535,"subfield":{"id":"https://openalex.org/subfields/2714","display_name":"Family Practice"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T12574","display_name":"Clinical Reasoning and Diagnostic Skills","score":0.049800001084804535,"subfield":{"id":"https://openalex.org/subfields/2714","display_name":"Family Practice"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T14400","display_name":"Medical Coding and Health Information","score":0.04360000044107437,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12790","display_name":"Nursing Diagnosis and Documentation","score":0.03709999844431877,"subfield":{"id":"https://openalex.org/subfields/2910","display_name":"Issues, ethics and legal aspects"},"field":{"id":"https://openalex.org/fields/29","display_name":"Nursing"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/medline","display_name":"MEDLINE","score":0.2802000045776367},{"id":"https://openalex.org/keywords/medical-imaging","display_name":"Medical imaging","score":0.24549999833106995},{"id":"https://openalex.org/keywords/medical-diagnosis","display_name":"Medical diagnosis","score":0.23350000381469727},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.2273000031709671},{"id":"https://openalex.org/keywords/data-collection","display_name":"Data collection","score":0.2102999985218048}],"concepts":[{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.4212999939918518},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.37860000133514404},{"id":"https://openalex.org/C19527891","wikidata":"https://www.wikidata.org/wiki/Q1120908","display_name":"Medical physics","level":1,"score":0.3635999858379364},{"id":"https://openalex.org/C2779473830","wikidata":"https://www.wikidata.org/wiki/Q1540899","display_name":"MEDLINE","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C31601959","wikidata":"https://www.wikidata.org/wiki/Q931309","display_name":"Medical imaging","level":2,"score":0.24549999833106995},{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.23350000381469727},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2273000031709671},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.21539999544620514},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.2102999985218048},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.20550000667572021}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.548","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.548","pdf_url":"https://aclanthology.org/2025.emnlp-main.548.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},{"id":"pmid:41799784","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41799784","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Conference on Empirical Methods in Natural Language Processing. Conference on Empirical Methods in Natural Language Processing","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:12961587","is_oa":true,"landing_page_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12961587/","pdf_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12961587/pdf/nihms-2140062.pdf","source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proc Conf Empir Methods Nat Lang Process","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.548","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.548","pdf_url":"https://aclanthology.org/2025.emnlp-main.548.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334151","display_name":"Center for Healthcare Organization and Implementation Research","ror":null}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036814.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"have":[4],"demonstrated":[5],"promising":[6],"performance":[7],"on":[8,64],"medical":[9,16,57,204],"benchmarks;":[10],"however,":[11],"their":[12],"ability":[13],"to":[14,103,169,197],"perform":[15],"calculations,":[17],"a":[18,38,61,77,143,182],"crucial":[19],"aspect":[20],"of":[21,98,163,176],"clinical":[22,50,65],"decision-making,":[23],"remains":[24],"underexplored":[25],"and":[26,46,70,75,89,137,152,180,190],"poorly":[27],"evaluated.":[28],"Existing":[29],"benchmarks":[30],"often":[31],"assess":[32],"only":[33],"the":[34,72,96,161,174],"final":[35],"answer":[36],"with":[37,60,132],"wide":[39],"numerical":[40],"tolerance,":[41],"overlooking":[42],"systematic":[43],"reasoning":[44,192],"failures":[45],"potentially":[47],"causing":[48],"serious":[49],"misjudgments.":[51],"In":[52],"this":[53,93],"work,":[54],"we":[55,68,112,141,194],"revisit":[56],"calculation":[58],"evaluation":[59,80,128],"stronger":[62],"focus":[63],"trustworthiness.":[66],"First,":[67],"clean":[69],"restructure":[71],"MedCalc-Bench":[73],"dataset":[74],"propose":[76,142],"new":[78],"step-by-step":[79],"pipeline":[81],"that":[82,119,148],"independently":[83],"assesses":[84],"formula":[85],"selection,":[86],"entity":[87],"extraction,":[88],"arithmetic":[90],"computation.":[91],"Under":[92],"granular":[94],"framework,":[95],"accuracy":[97,162],"GPT-4o":[99],"drops":[100],"from":[101,166],"62.7%":[102],"43.6%,":[104],"revealing":[105],"errors":[106],"masked":[107],"by":[108],"prior":[109],"evaluations.":[110],"Second,":[111],"introduce":[113],"an":[114],"automatic":[115],"error":[116],"analysis":[117],"framework":[118],"generates":[120],"structured":[121],"attribution":[122],"for":[123,202],"each":[124],"failure":[125],"mode.":[126],"Human":[127],"confirms":[129],"its":[130],"alignment":[131],"expert":[133],"judgment,":[134],"enabling":[135,188],"scalable":[136],"explainable":[138],"diagnostics.":[139],"Finally,":[140],"modular":[144],"agentic":[145],"pipeline,":[146],"MedRaC,":[147],"combines":[149],"retrieval-augmented":[150],"generation":[151],"Python-based":[153],"code":[154],"execution.":[155],"Without":[156],"any":[157],"fine-tuning,":[158],"MedRaC":[159],"improves":[160],"different":[164],"LLMs":[165],"16.35%":[167],"up":[168],"53.19%.":[170],"Our":[171],"work":[172],"highlights":[173],"limitations":[175],"current":[177],"benchmark":[178],"practices":[179],"proposes":[181],"more":[183],"clinically":[184],"faithful":[185],"methodology.":[186],"By":[187],"transparent":[189],"transferable":[191],"evaluation,":[193],"move":[195],"closer":[196],"making":[198],"LLM-based":[199],"systems":[200],"trustworthy":[201],"real-world":[203],"applications.":[205]},"counts_by_year":[],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-11-08T00:00:00"}
