{"id":"https://openalex.org/W7125251671","doi":"https://doi.org/10.48550/arxiv.2601.13885","title":"Confident Rankings with Fewer Items: Adaptive LLM Evaluation with Continuous Scores","display_name":"Confident Rankings with Fewer Items: Adaptive LLM Evaluation with Continuous Scores","publication_year":2026,"publication_date":"2026-01-20","ids":{"openalex":"https://openalex.org/W7125251671","doi":"https://doi.org/10.48550/arxiv.2601.13885"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.13885","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13885","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.13885","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034130220","display_name":"Esma Balk\u0131r","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Balk\u0131r, Esma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123540995","display_name":"Alice Pernthaller","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pernthaller, Alice","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088741264","display_name":"Marco Basaldella","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Basaldella, Marco","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123498224","display_name":"Jos\u00e9 Hern\u00e1ndez-Orallo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hern\u00e1ndez-Orallo, Jos\u00e9","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123464259","display_name":"Nigel Collier","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Collier, Nigel","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5034130220"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.42910000681877136,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.42910000681877136,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.09910000115633011,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.041099999099969864,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.7386999726295471},{"id":"https://openalex.org/keywords/computerized-adaptive-testing","display_name":"Computerized adaptive testing","score":0.5406000018119812},{"id":"https://openalex.org/keywords/extension","display_name":"Extension (predicate logic)","score":0.4223000109195709},{"id":"https://openalex.org/keywords/heteroscedasticity","display_name":"Heteroscedasticity","score":0.4058000147342682},{"id":"https://openalex.org/keywords/item-response-theory","display_name":"Item response theory","score":0.38580000400543213},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.38260000944137573},{"id":"https://openalex.org/keywords/bernoullis-principle","display_name":"Bernoulli's principle","score":0.38179999589920044}],"concepts":[{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.7386999726295471},{"id":"https://openalex.org/C144352353","wikidata":"https://www.wikidata.org/wiki/Q2920411","display_name":"Computerized adaptive testing","level":3,"score":0.5406000018119812},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5304999947547913},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5181000232696533},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4569999873638153},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.4223000109195709},{"id":"https://openalex.org/C101104100","wikidata":"https://www.wikidata.org/wiki/Q1063540","display_name":"Heteroscedasticity","level":2,"score":0.4058000147342682},{"id":"https://openalex.org/C19875794","wikidata":"https://www.wikidata.org/wiki/Q1207340","display_name":"Item response theory","level":3,"score":0.38580000400543213},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.38260000944137573},{"id":"https://openalex.org/C152361515","wikidata":"https://www.wikidata.org/wiki/Q181328","display_name":"Bernoulli's principle","level":2,"score":0.38179999589920044},{"id":"https://openalex.org/C2982777018","wikidata":"https://www.wikidata.org/wiki/Q99268086","display_name":"Adaptive design","level":3,"score":0.3528999984264374},{"id":"https://openalex.org/C34388435","wikidata":"https://www.wikidata.org/wiki/Q2267362","display_name":"Bounded function","level":2,"score":0.34529998898506165},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3425999879837036},{"id":"https://openalex.org/C80478641","wikidata":"https://www.wikidata.org/wiki/Q195771","display_name":"Sequential analysis","level":2,"score":0.3352000117301941},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.3018999993801117},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2957000136375427},{"id":"https://openalex.org/C106159264","wikidata":"https://www.wikidata.org/wiki/Q17146789","display_name":"Random testing","level":4,"score":0.2921000123023987},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C3018395757","wikidata":"https://www.wikidata.org/wiki/Q1379672","display_name":"Evaluation methods","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.26080000400543213}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.13885","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13885","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.13885","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13885","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Computerized":[0],"Adaptive":[1],"Testing":[2],"(CAT)":[3],"has":[4],"proven":[5],"effective":[6],"for":[7],"efficient":[8],"LLM":[9,16],"evaluation":[10,17],"on":[11,20,60,91,120],"multiple-choice":[12],"benchmarks,":[13],"but":[14],"modern":[15],"increasingly":[18],"relies":[19],"generation":[21],"tasks":[22],"where":[23],"outputs":[24],"are":[25],"scored":[26],"continuously":[27],"rather":[28],"than":[29],"marked":[30],"correct/incorrect.":[31],"We":[32,87],"present":[33],"a":[34,55],"principled":[35],"extension":[36],"of":[37,104],"IRT-based":[38],"adaptive":[39,69],"testing":[40,78],"to":[41],"continuous":[42],"bounded":[43],"scores":[44],"(ROUGE,":[45],"BLEU,":[46],"LLM-as-a-Judge)":[47],"by":[48,111],"replacing":[49],"the":[50,105],"Bernoulli":[51],"response":[52],"distribution":[53],"with":[54,68,117],"heteroskedastic":[56],"normal":[57],"distribution.":[58],"Building":[59],"this,":[61],"we":[62],"introduce":[63],"an":[64],"uncertainty":[65],"aware":[66],"ranker":[67],"stopping":[70],"criteria":[71],"that":[72],"achieves":[73],"reliable":[74],"model":[75],"ranking":[76,109],"while":[77,107],"as":[79,83,85],"few":[80],"items":[81,106],"and":[82,97],"cheaply":[84],"possible.":[86],"validate":[88],"our":[89],"method":[90,101],"five":[92],"benchmarks":[93],"spanning":[94],"n-gram-based,":[95],"embedding-based,":[96],"LLM-as-judge":[98],"metrics.":[99],"Our":[100],"uses":[102],"2%":[103],"improving":[108],"correlation":[110],"0.12":[112],"\u03c4":[113],"over":[114],"random":[115],"sampling,":[116],"95%":[118],"accuracy":[119],"confident":[121],"predictions.":[122]},"counts_by_year":[],"updated_date":"2026-01-22T23:33:04.759266","created_date":"2026-01-22T00:00:00"}
