{"id":"https://openalex.org/W7135066673","doi":"https://doi.org/10.48550/arxiv.2603.11027","title":"Beyond the Illusion of Consensus: From Surface Heuristics to Knowledge-Grounded Evaluation in LLM-as-a-Judge","display_name":"Beyond the Illusion of Consensus: From Surface Heuristics to Knowledge-Grounded Evaluation in LLM-as-a-Judge","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135066673","doi":"https://doi.org/10.48550/arxiv.2603.11027"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.11027","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11027","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.11027","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128886402","display_name":"Mingyang Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Song, Mingyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128833748","display_name":"Mao Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Mao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5057227372","display_name":"Chenning Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Chenning","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5128886402"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7143999934196472,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7143999934196472,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.03310000151395798,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.026000000536441803,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.973800003528595},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.8317999839782715},{"id":"https://openalex.org/keywords/pluralism","display_name":"Pluralism (philosophy)","score":0.4519999921321869},{"id":"https://openalex.org/keywords/phenomenon","display_name":"Phenomenon","score":0.4503999948501587},{"id":"https://openalex.org/keywords/toolbox","display_name":"Toolbox","score":0.44909998774528503},{"id":"https://openalex.org/keywords/illusion","display_name":"Illusion","score":0.42489999532699585},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.412200003862381}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.973800003528595},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.8317999839782715},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5389999747276306},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.46219998598098755},{"id":"https://openalex.org/C49831778","wikidata":"https://www.wikidata.org/wiki/Q879021","display_name":"Pluralism (philosophy)","level":2,"score":0.4519999921321869},{"id":"https://openalex.org/C50335755","wikidata":"https://www.wikidata.org/wiki/Q483247","display_name":"Phenomenon","level":2,"score":0.4503999948501587},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.44909998774528503},{"id":"https://openalex.org/C184047640","wikidata":"https://www.wikidata.org/wiki/Q182593","display_name":"Illusion","level":2,"score":0.42489999532699585},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.41850000619888306},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.412200003862381},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3659000098705292},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35100001096725464},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.34869998693466187},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.3474999964237213},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.34599998593330383},{"id":"https://openalex.org/C204434341","wikidata":"https://www.wikidata.org/wiki/Q357789","display_name":"Adjudication","level":2,"score":0.33340001106262207},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.32359999418258667},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C2781035248","wikidata":"https://www.wikidata.org/wiki/Q186150","display_name":"Fallacy","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C2776818064","wikidata":"https://www.wikidata.org/wiki/Q829903","display_name":"Agreement","level":2,"score":0.2680000066757202}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.11027","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11027","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.11027","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11027","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.709415078163147,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"paradigm":[1],"of":[2,66,110],"LLM-as-a-judge":[3],"relies":[4],"on":[5,54,169,199],"a":[6,43,63,146],"critical":[7],"assumption,":[8],"namely":[9],"that":[10,24,31,84,103,114,126,186],"high":[11],"inter-evaluator":[12],"agreement":[13,86,93,99],"indicates":[14],"reliable":[15],"and":[16,39,113],"objective":[17],"evaluation.":[18],"We":[19,37,139],"present":[20],"two":[21],"complementary":[22],"findings":[23,184],"challenge":[25],"this":[26,32],"assumption.":[27],"\\textbf{First},":[28],"we":[29,82,124],"demonstrate":[30,125],"consensus":[33],"is":[34],"frequently":[35],"illusory.":[36],"identify":[38],"formalize":[40],"\\textbf{Evaluation":[41],"Illusion},":[42],"phenomenon":[44],"where":[45,165,178],"LLM":[46],"judges":[47,75],"generate":[48],"sophisticated":[49],"critiques":[50],"yet":[51],"anchor":[52],"scores":[53],"shared":[55,170],"surface":[56],"heuristics":[57],"rather":[58,196],"than":[59,197],"substantive":[60],"quality.":[61],"Through":[62],"large-scale":[64],"study":[65],"105,600":[67],"evaluation":[68,129,187],"instances":[69],"(32":[70],"LLMs":[71],"$\\times$":[72,76,79],"3":[73],"frontier":[74],"100":[77],"tasks":[78],"11":[80],"temperatures),":[81],"show":[83],"model-level":[85],"(Spearman":[87],"$\u03c1=":[88],"0.99$)":[89],"masks":[90],"fragile":[91],"sample-level":[92],"(Pearson":[94],"$\\bar{r}":[95],"=":[96],"0.72$;":[97],"absolute":[98],"ICC":[100],"$=":[101],"0.67$),":[102],"merely":[104],"sharing":[105],"rubric":[106,148],"structure":[107],"restores":[108],"62\\%":[109],"total":[111],"agreement,":[112],"high-quality":[115],"outputs":[116],"paradoxically":[117],"receive":[118],"the":[119],"\\textit{least}":[120],"consistent":[121],"evaluations.":[122],"\\textbf{Second},":[123],"dynamically":[127,191],"generating":[128],"rubrics":[130,188],"grounded":[131],"in":[132,158,175,207],"domain":[133],"knowledge":[134,166,195],"produces":[135],"more":[136],"meaningful":[137],"assessment.":[138],"introduce":[140],"MERG":[141],"(Metacognitive":[142],"Enhanced":[143],"Rubric":[144],"Generation),":[145],"knowledge-driven":[147],"generation":[149],"framework":[150],"whose":[151],"domain-selective":[152],"effects":[153],"confirm":[154],"this.":[155],"Agreement":[156],"\\textit{increases}":[157],"codified":[159],"domains":[160,177],"(Education":[161],"+22\\%,":[162],"Academic":[163],"+27\\%)":[164],"anchors":[167],"evaluators":[168],"standards,":[171],"while":[172],"it":[173],"decreases":[174],"subjective":[176],"genuine":[179],"evaluative":[180],"pluralism":[181],"emerges.":[182],"These":[183],"suggest":[185],"should":[189],"be":[190],"enriched":[192],"with":[193,202],"expert":[194],"relying":[198],"generic":[200],"criteria,":[201],"implications":[203],"for":[204],"reward":[205],"modeling":[206],"RLAIF.":[208]},"counts_by_year":[],"updated_date":"2026-03-13T14:25:03.468858","created_date":"2026-03-13T00:00:00"}
