{"id":"https://openalex.org/W7148448845","doi":"https://doi.org/10.48550/arxiv.2604.00008","title":"How Trustworthy Are LLM-as-Judge Ratings for Interpretive Responses? Implications for Qualitative Research Workflows","display_name":"How Trustworthy Are LLM-as-Judge Ratings for Interpretive Responses? Implications for Qualitative Research Workflows","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7148448845","doi":"https://doi.org/10.48550/arxiv.2604.00008"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00008","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00008","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00008","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132768617","display_name":"Songhee Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Han, Songhee","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132778624","display_name":"Jueun Shin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shin, Jueun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132826151","display_name":"Jiyoon Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Jiyoon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132743057","display_name":"Bung-Woo Jun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jun, Bung-Woo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132750947","display_name":"Hilal Ayan Karabatman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Karabatman, Hilal Ayan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5132768617"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.2240000069141388,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.2240000069141388,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12880","display_name":"Qualitative Research Methods and Applications","score":0.07800000160932541,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.05490000173449516,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6261000037193298},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6046000123023987},{"id":"https://openalex.org/keywords/qualitative-research","display_name":"Qualitative research","score":0.5949000120162964},{"id":"https://openalex.org/keywords/trustworthiness","display_name":"Trustworthiness","score":0.5289000272750854},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4973999857902527},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.4456999897956848},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.38359999656677246},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.36250001192092896}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6261000037193298},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6046000123023987},{"id":"https://openalex.org/C190248442","wikidata":"https://www.wikidata.org/wiki/Q839486","display_name":"Qualitative research","level":2,"score":0.5949000120162964},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.5289000272750854},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5142999887466431},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4973999857902527},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.47119998931884766},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.4456999897956848},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.38440001010894775},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.38359999656677246},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.3790999948978424},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.37630000710487366},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3628999888896942},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.36250001192092896},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.3562999963760376},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.3549000024795532},{"id":"https://openalex.org/C3018587665","wikidata":"https://www.wikidata.org/wiki/Q7268696","display_name":"Qualitative analysis","level":3,"score":0.3467000126838684},{"id":"https://openalex.org/C87156501","wikidata":"https://www.wikidata.org/wiki/Q7268708","display_name":"Qualitative property","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C2781466463","wikidata":"https://www.wikidata.org/wiki/Q621695","display_name":"Blame","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29409998655319214},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C2777207495","wikidata":"https://www.wikidata.org/wiki/Q2072146","display_name":"Personnel selection","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C189708586","wikidata":"https://www.wikidata.org/wiki/Q1504425","display_name":"Systematic review","level":3,"score":0.2793000042438507},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.2736000120639801},{"id":"https://openalex.org/C107645774","wikidata":"https://www.wikidata.org/wiki/Q5467169","display_name":"Human resources","level":2,"score":0.27070000767707825},{"id":"https://openalex.org/C169806903","wikidata":"https://www.wikidata.org/wiki/Q5937752","display_name":"Human error","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C56995899","wikidata":"https://www.wikidata.org/wiki/Q1126687","display_name":"Focus group","level":2,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00008","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00008","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00008","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00008","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5976231694221497,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"qualitative":[1,240],"researchers":[2],"show":[3,151],"growing":[4],"interest":[5],"in":[6,159,169,239],"using":[7,92,121],"automated":[8,173],"tools":[9],"to":[10,206],"support":[11],"interpretive":[12,32,50,68,90,143,148,207],"analysis,":[13],"a":[14,130],"large":[15],"language":[16],"model":[17,41,164],"(LLM)":[18],"is":[19],"often":[20],"introduced":[21],"into":[22],"an":[23],"analytic":[24],"workflow":[25],"as":[26],"is,":[27],"without":[28],"systematic":[29,189,233],"evaluation":[30],"of":[31,67,133,237],"quality":[33,69],"or":[34,220],"comparison":[35,234],"across":[36,126],"models.":[37],"This":[38],"practice":[39],"leaves":[40],"selection":[42,236],"largely":[43,204],"unexamined":[44],"despite":[45],"its":[46],"potential":[47],"influence":[48],"on":[49,142],"outcomes.":[51],"To":[52],"address":[53],"this":[54,56],"gap,":[55],"study":[57],"examines":[58],"whether":[59],"LLM-as-judge":[60,124,153,213],"evaluations":[61,118,161],"meaningfully":[62],"align":[63],"with":[64,83,180],"human":[65,140,160,182,227],"judgments":[66],"and":[70,112,129,147,186,198,235],"can":[71],"inform":[72],"model-level":[73],"decision":[74],"making.":[75],"Using":[76],"712":[77],"conversational":[78],"excerpts":[79],"from":[80],"semi-structured":[81],"interviews":[82],"K-12":[84],"mathematics":[85],"teachers,":[86],"we":[87],"generated":[88],"one-sentence":[89],"responses":[91,134],"five":[93,127],"widely":[94],"adopted":[95],"inference":[96],"models:":[97],"Command":[98],"R+":[99],"(Cohere),":[100],"Gemini":[101],"2.5":[102],"Pro":[103],"(Google),":[104],"GPT-5.1":[105],"(OpenAI),":[106],"Llama":[107],"4":[108],"Scout-17B":[109],"Instruct":[110],"(Meta),":[111],"Qwen":[113],"3-32B":[114],"Dense":[115],"(Alibaba).":[116],"Automated":[117],"were":[119,203],"conducted":[120],"AWS":[122],"Bedrock's":[123],"framework":[125],"metrics,":[128,174],"stratified":[131],"subset":[132],"was":[135],"independently":[136],"rated":[137],"by":[138],"trained":[139],"evaluators":[141],"accuracy,":[144],"nuance":[145],"preservation,":[146],"coherence.":[149],"Results":[150],"that":[152,212],"scores":[154],"capture":[155],"broad":[156],"directional":[157],"trends":[158],"at":[162,191],"the":[163,177,192],"level":[165],"but":[166],"diverge":[167],"substantially":[168],"score":[170],"magnitude.":[171],"Among":[172],"Coherence":[175],"showed":[176],"strongest":[178],"alignment":[179],"aggregated":[181],"ratings,":[183],"whereas":[184],"Faithfulness":[185],"Correctness":[187],"revealed":[188],"misalignment":[190],"excerpt":[193],"level,":[194],"particularly":[195],"for":[196,218,225,232],"non-literal":[197],"nuanced":[199],"interpretations.":[200],"Safety-related":[201],"metrics":[202],"irrelevant":[205],"quality.":[208],"These":[209],"findings":[210],"suggest":[211],"methods":[214],"are":[215],"better":[216],"suited":[217],"screening":[219],"eliminating":[221],"underperforming":[222],"models":[223],"than":[224],"replacing":[226],"judgment,":[228],"offering":[229],"practical":[230],"guidance":[231],"LLMs":[238],"research":[241],"workflows.":[242]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
