{"id":"https://openalex.org/W7155034440","doi":"https://doi.org/10.48550/arxiv.2604.16383","title":"Same Verdict, Different Reasons: LLM-as-a-Judge and Clinician Disagreement on Medical Chatbot Completeness","display_name":"Same Verdict, Different Reasons: LLM-as-a-Judge and Clinician Disagreement on Medical Chatbot Completeness","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7155034440","doi":"https://doi.org/10.48550/arxiv.2604.16383"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.16383","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16383","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.16383","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045330788","display_name":"Alexandra DeLucia","orcid":"https://orcid.org/0000-0003-4325-9170"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"DeLucia, Alexandra","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134201142","display_name":"Heyuan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Heyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052564654","display_name":"Sonal Joshi","orcid":"https://orcid.org/0000-0001-8020-7551"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joshi, Sonal","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067578148","display_name":"Mahsa Yarmohammadi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yarmohammadi, Mahsa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134190784","display_name":"Ahmed Hassoon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hassoon, Ahmed","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134130139","display_name":"Mark Dredze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dredze, Mark","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5045330788"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.8420000076293945,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.8420000076293945,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.0203000009059906,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.016100000590085983,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/triage","display_name":"Triage","score":0.6611999869346619},{"id":"https://openalex.org/keywords/false-positive-paradox","display_name":"False positive paradox","score":0.6406999826431274},{"id":"https://openalex.org/keywords/completeness","display_name":"Completeness (order theory)","score":0.6098999977111816},{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.5367000102996826},{"id":"https://openalex.org/keywords/false-positives-and-false-negatives","display_name":"False positives and false negatives","score":0.4627000093460083},{"id":"https://openalex.org/keywords/recall","display_name":"Recall","score":0.4449000060558319},{"id":"https://openalex.org/keywords/chatbot","display_name":"Chatbot","score":0.40310001373291016},{"id":"https://openalex.org/keywords/medical-diagnosis","display_name":"Medical diagnosis","score":0.4009999930858612},{"id":"https://openalex.org/keywords/statistician","display_name":"Statistician","score":0.3644999861717224}],"concepts":[{"id":"https://openalex.org/C2777120189","wikidata":"https://www.wikidata.org/wiki/Q780067","display_name":"Triage","level":2,"score":0.6611999869346619},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.6406999826431274},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.6098999977111816},{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.5367000102996826},{"id":"https://openalex.org/C112789634","wikidata":"https://www.wikidata.org/wiki/Q18207010","display_name":"False positives and false negatives","level":3,"score":0.4627000093460083},{"id":"https://openalex.org/C100660578","wikidata":"https://www.wikidata.org/wiki/Q18733","display_name":"Recall","level":2,"score":0.4449000060558319},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4377000033855438},{"id":"https://openalex.org/C2779041454","wikidata":"https://www.wikidata.org/wiki/Q870780","display_name":"Chatbot","level":2,"score":0.40310001373291016},{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.4009999930858612},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.38989999890327454},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.3853999972343445},{"id":"https://openalex.org/C2779677306","wikidata":"https://www.wikidata.org/wiki/Q2732142","display_name":"Statistician","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C59577422","wikidata":"https://www.wikidata.org/wiki/Q10265143","display_name":"False accusation","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.35040000081062317},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.3273000121116638},{"id":"https://openalex.org/C2780572471","wikidata":"https://www.wikidata.org/wiki/Q18206027","display_name":"Clinical Ethics","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30230000615119934},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C2779473830","wikidata":"https://www.wikidata.org/wiki/Q1540899","display_name":"MEDLINE","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C2989486834","wikidata":"https://www.wikidata.org/wiki/Q3808900","display_name":"True positive rate","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2662999927997589},{"id":"https://openalex.org/C3020452639","wikidata":"https://www.wikidata.org/wiki/Q454812","display_name":"Clinical judgment","level":2,"score":0.26579999923706055},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C2779338814","wikidata":"https://www.wikidata.org/wiki/Q5179285","display_name":"Covert","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C3018949938","wikidata":"https://www.wikidata.org/wiki/Q17166101","display_name":"Text messaging","level":2,"score":0.25589999556541443}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.16383","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16383","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.16383","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16383","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6633118391036987,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM-as-a-Judge":[0],"frameworks":[1],"are":[2],"increasingly":[3],"trusted":[4],"to":[5,77],"automate":[6],"evaluation":[7],"in":[8,16,153],"place":[9],"of":[10,80,90],"human":[11],"experts,":[12],"yet":[13],"their":[14,145],"reliability":[15],"high-stakes":[17],"medical":[18,30,55],"contexts":[19],"remains":[20],"unproven.":[21],"We":[22],"stress-test":[23],"this":[24],"assumption":[25],"for":[26,54],"detecting":[27],"incomplete":[28,63,81],"patient-facing":[29],"responses,":[31,82],"evaluating":[32],"three":[33,40],"rubric":[34],"granularities":[35],"(General-Likert,":[36],"Analytical-Rubric,":[37],"Dynamic-Checklist)":[38],"and":[39,66,100,110,134],"backbone":[41],"models":[42],"across":[43],"two":[44],"clinician-annotated":[45],"datasets,":[46],"including":[47],"HealthBench,":[48],"the":[49,74,87,91,107],"largest":[50],"publicly":[51],"available":[52],"benchmark":[53],"response":[56],"evaluation.":[57],"LLM":[58,132],"Judges":[59,133],"discriminate":[60],"complete":[61],"from":[62,117],"responses":[64],"at":[65,73],"slightly":[67],"above":[68],"near":[69],"chance":[70],"(AUC":[71],"$0.49$--$0.66$);":[72],"threshold":[75],"required":[76],"recall":[78],"$90\\%$":[79],"clinicians":[83,135],"must":[84],"still":[85],"review":[86],"vast":[88],"majority":[89],"dataset,":[92],"offering":[93],"no":[94],"triage":[95,151],"utility.":[96],"Even":[97],"when":[98,111],"model":[99],"clinician":[101],"verdicts":[102],"agree,":[103],"they":[104,112],"rarely":[105],"cite":[106],"same":[108],"explanation;":[109],"diverge,":[113],"false":[114,122],"positives":[115],"stem":[116],"over-flagging":[118],"non-essential":[119],"gaps":[120],"while":[121],"negatives":[123],"reflect":[124],"outright":[125],"detection":[126],"failures.":[127],"These":[128],"results":[129],"reveal":[130],"that":[131,143],"apply":[136],"fundamentally":[137],"different":[138],"completeness":[139],"standards;":[140],"a":[141],"finding":[142],"undermines":[144],"use":[146],"as":[147],"autonomous":[148],"evaluators":[149],"or":[150],"filters":[152],"clinical":[154],"settings.":[155]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-22T00:00:00"}
