{"id":"https://openalex.org/W7154218170","doi":"https://doi.org/10.48550/arxiv.2604.11133","title":"How Robust Are Large Language Models for Clinical Numeracy? An Empirical Study on Numerical Reasoning Abilities in Clinical Contexts","display_name":"How Robust Are Large Language Models for Clinical Numeracy? An Empirical Study on Numerical Reasoning Abilities in Clinical Contexts","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154218170","doi":"https://doi.org/10.48550/arxiv.2604.11133"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11133","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11133","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11133","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104021102","display_name":"Minh-Vuong Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Minh-Vuong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006717459","display_name":"Fatemeh Shiri","orcid":"https://orcid.org/0000-0001-8752-2132"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shiri, Fatemeh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133603734","display_name":"Zhuang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133553937","display_name":"Karin Verspoor","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Verspoor, Karin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.4431000053882599,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.4431000053882599,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.26809999346733093,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.1371999979019165,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5824999809265137},{"id":"https://openalex.org/keywords/numeracy","display_name":"Numeracy","score":0.5690000057220459},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.5559999942779541},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5343999862670898},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.43869999051094055},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.4262999892234802},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.40560001134872437},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.337799996137619},{"id":"https://openalex.org/keywords/missing-data","display_name":"Missing data","score":0.33739998936653137},{"id":"https://openalex.org/keywords/value","display_name":"Value (mathematics)","score":0.3352999985218048}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6542999744415283},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5824999809265137},{"id":"https://openalex.org/C53537400","wikidata":"https://www.wikidata.org/wiki/Q140637","display_name":"Numeracy","level":3,"score":0.5690000057220459},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.5559999942779541},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5343999862670898},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.43869999051094055},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.4262999892234802},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42289999127388},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.40560001134872437},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39149999618530273},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C9357733","wikidata":"https://www.wikidata.org/wiki/Q6878417","display_name":"Missing data","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.3352999985218048},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C46274116","wikidata":"https://www.wikidata.org/wiki/Q185521","display_name":"Truth value","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C40207289","wikidata":"https://www.wikidata.org/wiki/Q755662","display_name":"Relational model","level":3,"score":0.31839999556541443},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3174999952316284},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31690001487731934},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.30730000138282776},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C2986605239","wikidata":"https://www.wikidata.org/wiki/Q925667","display_name":"Numerical models","level":3,"score":0.29280000925064087},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C55037315","wikidata":"https://www.wikidata.org/wiki/Q5421151","display_name":"Experimental data","level":2,"score":0.28760001063346863},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2849000096321106},{"id":"https://openalex.org/C3020452639","wikidata":"https://www.wikidata.org/wiki/Q454812","display_name":"Clinical judgment","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C2779974597","wikidata":"https://www.wikidata.org/wiki/Q28448986","display_name":"Clinical Practice","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.26440000534057617},{"id":"https://openalex.org/C192028432","wikidata":"https://www.wikidata.org/wiki/Q845739","display_name":"Query language","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.25850000977516174}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11133","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11133","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11133","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11133","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8713279366493225,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"are":[4,187],"increasingly":[5],"being":[6],"explored":[7],"for":[8,33,178],"clinical":[9,27,34,55,76],"question":[10,116],"answering":[11],"and":[12,46,84,111,138,162,184],"decision":[13],"support,":[14],"yet":[15],"safe":[16],"deployment":[17],"critically":[18],"requires":[19],"reliable":[20,180],"handling":[21],"of":[22,31,51,63,75],"patient":[23],"measurements":[24],"in":[25,95],"heterogeneous":[26],"notes.":[28],"Existing":[29],"evaluations":[30],"LLMs":[32,121],"numerical":[35,52,181],"reasoning":[36],"provide":[37],"limited":[38],"operation-level":[39],"coverage,":[40],"restricted":[41],"primarily":[42],"to":[43,156,171],"arithmetic":[44,80],"computation,":[45,81],"rarely":[47],"assess":[48],"the":[49,107],"robustness":[50],"understanding":[53],"across":[54],"note":[56],"formats.":[57],"We":[58],"introduce":[59],"ClinicNumRobBench,":[60],"a":[61,101,175],"benchmark":[62],"1,624":[64],"context-question":[65],"instances":[66],"with":[67,129,142],"ground-truth":[68],"answers":[69],"that":[70,123],"evaluates":[71],"four":[72],"main":[73],"types":[74],"numeracy:":[77],"value":[78,124],"retrieval,":[79],"relational":[82,136],"comparison,":[83],"aggregation.":[85],"To":[86],"stress-test":[87],"robustness,":[88],"ClinicNumRobBench":[89,173],"presents":[90],"longitudinal":[91],"MIMIC-IV":[92],"vital-sign":[93],"records":[94],"three":[96],"semantically":[97],"equivalent":[98],"representations,":[99],"including":[100],"real-world":[102],"note-style":[103,166],"variant":[104],"derived":[105],"from":[106],"Open":[108],"Patients":[109],"dataset,":[110],"instantiates":[112],"queries":[113],"using":[114],"42":[115],"templates.":[117],"Experiments":[118],"on":[119,149,189],"17":[120],"show":[122],"retrieval":[125],"is":[126],"generally":[127],"strong,":[128],"most":[130],"models":[131,144,158],"exceeding":[132],"85%":[133],"accuracy,":[134],"while":[135],"comparison":[137],"aggregation":[139],"remain":[140],"challenging,":[141],"some":[143],"scoring":[145],"below":[146],"15%.":[147],"Fine-tuning":[148],"medical":[150],"data":[151,185],"can":[152],"reduce":[153],"numeracy":[154],"relative":[155],"base":[157],"by":[159],"over":[160],"30%,":[161],"performance":[163],"drops":[164],"under":[165],"variation":[167],"indicate":[168],"LLM":[169],"sensitivity":[170],"format.":[172],"offers":[174],"rigorous":[176],"testbed":[177],"clinically":[179],"reasoning.":[182],"Code":[183],"URL":[186],"available":[188],"https://github.com/MinhVuong2000/ClinicNumRobBench.":[190]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
