{"id":"https://openalex.org/W7133295997","doi":"https://doi.org/10.48550/arxiv.2603.00314","title":"When Metrics Disagree: Automatic Similarity vs. LLM-as-a-Judge for Clinical Dialogue Evaluation","display_name":"When Metrics Disagree: Automatic Similarity vs. LLM-as-a-Judge for Clinical Dialogue Evaluation","publication_year":2026,"publication_date":"2026-02-27","ids":{"openalex":"https://openalex.org/W7133295997","doi":"https://doi.org/10.48550/arxiv.2603.00314"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00314","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00314","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00314","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127965719","display_name":"Bian Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Bian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029238394","display_name":"Z Wang","orcid":"https://orcid.org/0000-0002-1331-4207"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhenjian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128010455","display_name":"Orvill de la Torre","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"de la Torre, Orvill","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127986205","display_name":"Zirui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zirui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5127965719"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.7336999773979187,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.7336999773979187,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.10559999942779541,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.058400001376867294,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6439999938011169},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5764999985694885},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5640000104904175},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.5088000297546387},{"id":"https://openalex.org/keywords/health-care","display_name":"Health care","score":0.4163999855518341},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.400299996137619}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6517999768257141},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6439999938011169},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5764999985694885},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5640000104904175},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5454000234603882},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.5088000297546387},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5080999732017517},{"id":"https://openalex.org/C160735492","wikidata":"https://www.wikidata.org/wiki/Q31207","display_name":"Health care","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.400299996137619},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.38179999589920044},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.35899999737739563},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3287999927997589},{"id":"https://openalex.org/C106977388","wikidata":"https://www.wikidata.org/wiki/Q2752427","display_name":"Medical research","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C163763905","wikidata":"https://www.wikidata.org/wiki/Q17075943","display_name":"Precision medicine","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26159998774528503}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00314","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00314","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00314","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00314","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"are":[5],"increasingly":[6],"integrated":[7],"into":[8,61],"healthcare":[9,225],"to":[10,86,94,106],"address":[11],"complex":[12],"inquiries,":[13],"ensuring":[14],"their":[15],"reliability":[16],"remains":[17,214],"a":[18,120,161,181],"critical":[19],"challenge.":[20],"Recent":[21],"studies":[22],"have":[23],"highlighted":[24],"that":[25,148,196],"generic":[26],"LLMs":[27,223],"often":[28],"struggle":[29],"in":[30,92,165,224],"clinical":[31,103,191],"contexts,":[32],"occasionally":[33],"producing":[34],"misleading":[35],"guidance.":[36],"To":[37],"mitigate":[38],"these":[39],"risks,":[40],"this":[41],"research":[42],"focuses":[43],"on":[44],"the":[45,51,62,68,76,80,99,107,114,150,166,172,219],"domain-specific":[46],"adaptation":[47],"of":[48,79,102,110,222],"\\textbf{Llama-2-7B}":[49],"using":[50,70,140],"\\textbf{Low-Rank":[52],"Adaptation":[53],"(LoRA)}":[54],"technique.":[55],"By":[56],"injecting":[57],"trainable":[58],"low-rank":[59],"matrices":[60],"Transformer":[63],"layers,":[64],"we":[65,194],"efficiently":[66],"adapted":[67],"model":[69,152],"authentic":[71],"patient-physician":[72],"transcripts":[73],"while":[74,133,149,197],"preserving":[75],"foundational":[77],"knowledge":[78],"base":[81],"model.":[82],"Our":[83,145],"objective":[84],"was":[85,117],"enhance":[87],"precision":[88],"and":[89,200],"contextual":[90],"relevance":[91],"responding":[93],"medical":[95,212],"queries":[96],"by":[97,210],"capturing":[98],"specialized":[100],"nuances":[101],"discourse.":[104],"Due":[105],"resource-intensive":[108],"nature":[109],"large-scale":[111],"human":[112,211],"validation,":[113],"model's":[115,174],"performance":[116],"evaluated":[118],"through":[119],"dual-track":[121],"framework:":[122],"\\textbf{Track":[123,134],"A}":[124],"utilized":[125],"traditional":[126,184],"lexical":[127,159],"similarity":[128],"metrics":[129,199],"(e.g.,":[130],"BLEU,":[131],"ROUGE),":[132],"B}":[135],"employed":[136],"an":[137,215],"\"LLM-as-a-Judge\"":[138],"paradigm":[139],"GPT-4":[141,167],"for":[142,218],"semantic":[143],"assessment.":[144],"results":[146],"demonstrate":[147],"LoRA-enhanced":[151],"achieved":[153],"significant":[154],"improvements":[155],"across":[156],"all":[157],"quantitative":[158],"dimensions,":[160],"profound":[162],"disagreement":[163],"surfaced":[164],"evaluation,":[168],"which":[169],"marginally":[170],"favored":[171],"baseline":[173],"conversational":[175],"flow.":[176],"This":[177],"metric":[178],"divergence":[179],"underscores":[180],"pivotal":[182],"finding:":[183],"automated":[185,198],"scores":[186],"may":[187],"not":[188],"fully":[189],"reflect":[190],"utility.":[192],"Consequently,":[193],"propose":[195],"LLM":[201],"judges":[202],"serve":[203],"as":[204],"valuable":[205],"developmental":[206],"proxies,":[207],"rigorous":[208],"validation":[209],"experts":[213],"indispensable":[216],"requirement":[217],"safe":[220],"deployment":[221],"settings.":[226]},"counts_by_year":[],"updated_date":"2026-04-02T13:48:15.688549","created_date":"2026-03-04T00:00:00"}
