{"id":"https://openalex.org/W7162409075","doi":"https://doi.org/10.48550/arxiv.2605.25240","title":"JudgmentBench: Comparing Rubric and Preference Evaluation for Quality Assessment","display_name":"JudgmentBench: Comparing Rubric and Preference Evaluation for Quality Assessment","publication_year":2026,"publication_date":"2026-05-24","ids":{"openalex":"https://openalex.org/W7162409075","doi":"https://doi.org/10.48550/arxiv.2605.25240"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.25240","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25240","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.25240","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137055961","display_name":"Russell Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Russell","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067247111","display_name":"Ruishi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ruishi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137010691","display_name":"Pierce Kelaita","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kelaita, Pierce","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005831772","display_name":"Riya Ranjan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ranjan, Riya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137012431","display_name":"Sibo Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Sibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137063873","display_name":"Charles Dickens","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dickens, Charles","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119204479","display_name":"Matthew Guillod","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guillod, Matthew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137033860","display_name":"Megan Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Megan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5017688180","display_name":"Julian Nyarko","orcid":"https://orcid.org/0000-0002-7121-5696"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nyarko, Julian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.41190001368522644,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.41190001368522644,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.13040000200271606,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.08980000019073486,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.965499997138977},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.7178999781608582},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5805000066757202},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.5146999955177307},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.4690999984741211},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4645000100135803},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4099999964237213},{"id":"https://openalex.org/keywords/inter-rater-reliability","display_name":"Inter-rater reliability","score":0.3528999984264374}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.965499997138977},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.7178999781608582},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5805000066757202},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.5146999955177307},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4934000074863434},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.4690999984741211},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4645000100135803},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.462799996137619},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4099999964237213},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3675999939441681},{"id":"https://openalex.org/C61863361","wikidata":"https://www.wikidata.org/wiki/Q470749","display_name":"Inter-rater reliability","level":3,"score":0.3528999984264374},{"id":"https://openalex.org/C3020001037","wikidata":"https://www.wikidata.org/wiki/Q836575","display_name":"Quality assessment","level":3,"score":0.337799996137619},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.3361999988555908},{"id":"https://openalex.org/C2779346075","wikidata":"https://www.wikidata.org/wiki/Q7268763","display_name":"Quality Score","level":3,"score":0.3278999924659729},{"id":"https://openalex.org/C204434341","wikidata":"https://www.wikidata.org/wiki/Q357789","display_name":"Adjudication","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3043999969959259},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.30300000309944153},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C53839665","wikidata":"https://www.wikidata.org/wiki/Q2067088","display_name":"Peer assessment","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.2939000129699707},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.272599995136261},{"id":"https://openalex.org/C3018395757","wikidata":"https://www.wikidata.org/wiki/Q1379672","display_name":"Evaluation methods","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C44083865","wikidata":"https://www.wikidata.org/wiki/Q3853443","display_name":"Mean reciprocal rank","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.25240","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25240","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.25240","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25240","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7466195225715637}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Two":[0],"methodologies":[1,24],"dominate":[2],"current":[3],"practices":[4],"of":[5,40,128,177],"benchmarking:":[6],"rubric-based":[7],"scoring":[8],"evaluates":[9],"items":[10],"against":[11],"predefined":[12],"criteria,":[13],"whereas":[14],"comparative":[15,107],"judgment":[16,188],"elicits":[17],"pairwise":[18,52,141],"preferences":[19],"between":[20,30],"outputs.":[21],"Although":[22],"both":[23,80,119],"are":[25,83],"widely":[26],"used,":[27],"the":[28,69,86,90,110,158,174,178],"choice":[29],"them":[31],"is":[32],"rarely":[33],"justified.":[34],"We":[35],"release":[36],"JudgmentBench,":[37],"a":[38,75,120,139,181],"benchmark":[39],"30":[41],"real-world":[42],"legal":[43],"tasks,":[44],"paired":[45,175],"with":[46],"1,539":[47],"rubric":[48],"scores":[49],"and":[50,138,167,193],"1,530":[51],"preference":[53],"judgments":[54,108],"collected":[55],"from":[56,85],"practicing":[57],"attorneys--including":[58],"at":[59,96],"major":[60],"U.S.":[61],"law":[62],"firms--with":[63],"substantial":[64],"experience.":[65],"The":[66,161],"annotations":[67],"constitute":[68],"first":[70],"publicly":[71],"available":[72],"dataset":[73,179],"in":[74,78,197],"high-expertise":[76],"domain":[77],"which":[79],"supervision":[81,196],"signals":[82],"elicited":[84],"same":[87,91],"experts":[88],"on":[89,185],"items.":[92],"Using":[93],"LLM-generated":[94],"outputs":[95],"three":[97],"constructed":[98],"quality":[99,112],"levels,":[100],"we":[101],"provide":[102],"an":[103],"initial":[104,172],"empirical":[105],"comparison:":[106],"recover":[109],"intended":[111],"ordering":[113],"substantially":[114],"better":[115],"than":[116,156],"rubrics":[117],"under":[118],"per-task":[121],"rank-correlation":[122],"metric":[123,143],"(mean":[124],"Spearman's":[125],"rank":[126],"correlation":[127],"0.908":[129],"vs.":[130,145],"0.150,":[131],"estimated":[132,147],"difference":[133,148],"=":[134,149],"0.758":[135],"[0.494,":[136],"1.021])":[137],"per-judgment":[140],"win-rate":[142],"(0.669":[144],"0.542,":[146],"0.127":[150],"[0.067,":[151],"0.186]),":[152],"while":[153],"requiring":[154],"less":[155],"half":[157],"annotation":[159],"time.":[160],"patterns":[162],"hold":[163],"for":[164],"human":[165],"annotators":[166],"LLM":[168],"autograders.":[169],"Beyond":[170],"this":[171],"comparison,":[173],"structure":[176],"supports":[180],"broader":[182],"research":[183],"agenda":[184],"how":[186],"expert":[187],"should":[189],"be":[190],"elicited,":[191],"aggregated,":[192],"used":[194],"as":[195],"domains":[198],"without":[199],"verifiable":[200],"ground":[201],"truth.":[202]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-27T00:00:00"}
