{"id":"https://openalex.org/W7162794812","doi":"https://doi.org/10.48550/arxiv.2605.30104","title":"SEAL: Can Saturated Benchmarks Be Revived by LLM-as-a-Meta-Judge?","display_name":"SEAL: Can Saturated Benchmarks Be Revived by LLM-as-a-Meta-Judge?","publication_year":2026,"publication_date":"2026-05-28","ids":{"openalex":"https://openalex.org/W7162794812","doi":"https://doi.org/10.48550/arxiv.2605.30104"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.30104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.30104","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137373561","display_name":"Jiamin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiamin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033535840","display_name":"Yidi Wu","orcid":"https://orcid.org/0000-0001-8204-7395"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yidi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090362440","display_name":"Qiexiang Wang","orcid":"https://orcid.org/0000-0002-8037-8227"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qiexiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011725324","display_name":"Qianben Chen","orcid":"https://orcid.org/0000-0003-2546-0225"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qianben","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137324014","display_name":"Yuchen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137344659","display_name":"Yansen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yansen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137356627","display_name":"Xiaokun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiaokun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137392877","display_name":"Wangchunshu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wangchunshu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137317928","display_name":"Chen Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.76419997215271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.76419997215271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.04989999905228615,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.028699999675154686,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.8134999871253967},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6557999849319458},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.6011999845504761},{"id":"https://openalex.org/keywords/protocol","display_name":"Protocol (science)","score":0.5636000037193298},{"id":"https://openalex.org/keywords/checklist","display_name":"Checklist","score":0.4593000113964081},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.37470000982284546},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.35030001401901245}],"concepts":[{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.8134999871253967},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6557999849319458},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.6011999845504761},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5856999754905701},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.5636000037193298},{"id":"https://openalex.org/C2779356329","wikidata":"https://www.wikidata.org/wiki/Q922625","display_name":"Checklist","level":2,"score":0.4593000113964081},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.39890000224113464},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39320001006126404},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3630000054836273},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.35030001401901245},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3458999991416931},{"id":"https://openalex.org/C2777755289","wikidata":"https://www.wikidata.org/wiki/Q162919","display_name":"Seal (emblem)","level":2,"score":0.3424000144004822},{"id":"https://openalex.org/C90329073","wikidata":"https://www.wikidata.org/wiki/Q914232","display_name":"Ask price","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.28029999136924744},{"id":"https://openalex.org/C3018395757","wikidata":"https://www.wikidata.org/wiki/Q1379672","display_name":"Evaluation methods","level":2,"score":0.2639000117778778},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.2574999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.30104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.30104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.30104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Widely":[0],"used":[1],"language-model":[2],"benchmarks":[3,87],"are":[4],"increasingly":[5],"saturated,":[6],"with":[7,47,74,116,132],"frontier":[8],"systems":[9],"often":[10],"receiving":[11],"near-tied":[12],"scores":[13],"that":[14],"standard":[15],"metrics":[16],"cannot":[17],"resolve.":[18],"Rather":[19],"than":[20],"constructing":[21],"harder":[22],"alternatives,":[23],"we":[24,43],"ask":[25],"whether":[26],"existing":[27],"tasks":[28],"can":[29],"be":[30],"made":[31],"informative":[32],"again":[33],"through":[34],"improved":[35],"evaluation":[36,52],"over":[37,109],"the":[38,106],"same":[39],"candidate":[40,64],"outputs.":[41],"Therefore,":[42],"present":[44],"Seeded":[45],"Elimination":[46],"Adaptive":[48],"LLM-as-a-Meta-Judge,":[49],"a":[50,67],"self-improving":[51,78],"protocol":[53],"for":[54,134],"extracting":[55],"latent":[56],"ranking":[57],"signal":[58],"from":[59],"saturated":[60,86],"benchmarks.":[61],"SEAL":[62,83,104],"seeds":[63],"outputs":[65],"into":[66],"single":[68],"elimination":[69],"and":[70,96,120],"evaluates":[71],"each":[72],"match":[73],"task-level":[75],"principles":[76],"plus":[77],"checklist":[79],"criteria.":[80],"We":[81],"evaluate":[82],"on":[84],"multiple":[85],"covering":[88],"code":[89],"generation,":[90],"mathematical":[91],"reasoning,":[92],"knowledge-intensive":[93],"question":[94],"answering,":[95],"tool-use":[97],"agent":[98],"task":[99,130],"completion.":[100],"Across":[101],"these":[102],"settings,":[103],"improves":[105],"ranking-accuracy--latency":[107],"trade-off":[108],"competing":[110],"protocols,":[111],"attaining":[112],"0.83--1.00":[113],"Spearman":[114],"agreement":[115],"full":[117,135],"pairwise":[118,136],"judging":[119],"4/4":[121],"top-1":[122],"agreement,":[123],"while":[124],"requiring":[125],"only":[126],"11.89":[127],"calls":[128],"per":[129],"compared":[131],"28.00":[133],"evaluation.":[137]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-30T00:00:00"}
