{"id":"https://openalex.org/W7148865587","doi":"https://doi.org/10.48550/arxiv.2604.01418","title":"Cost-Efficient Estimation of General Abilities Across Benchmarks","display_name":"Cost-Efficient Estimation of General Abilities Across Benchmarks","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7148865587","doi":"https://doi.org/10.48550/arxiv.2604.01418"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.01418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.01418","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073663407","display_name":"Michael Krumdick","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krumdick, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012580224","display_name":"Adam Wiemerslage","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wiemerslage, Adam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036823356","display_name":"Seth Ebner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ebner, Seth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018940839","display_name":"Charles Lovering","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lovering, Charles","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5090162434","display_name":"Chris C. Tanner","orcid":"https://orcid.org/0000-0001-9343-5190"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tanner, Chris","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24580000340938568,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24580000340938568,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.15379999577999115,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.05939999967813492,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.829800009727478},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6969000101089478},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5906000137329102},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.579200029373169},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5733000040054321},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5121999979019165},{"id":"https://openalex.org/keywords/item-response-theory","display_name":"Item response theory","score":0.36959999799728394},{"id":"https://openalex.org/keywords/estimation","display_name":"Estimation","score":0.3418000042438507}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.829800009727478},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7771999835968018},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6969000101089478},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.652899980545044},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5906000137329102},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.579200029373169},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5733000040054321},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5304999947547913},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5121999979019165},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.41449999809265137},{"id":"https://openalex.org/C19875794","wikidata":"https://www.wikidata.org/wiki/Q1207340","display_name":"Item response theory","level":3,"score":0.36959999799728394},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.3418000042438507},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3386000096797943},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.26190000772476196},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C167085575","wikidata":"https://www.wikidata.org/wiki/Q6803654","display_name":"Mean squared prediction error","level":2,"score":0.25440001487731934},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.01418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.01418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Thousands":[0],"of":[1,11,32,55,67,81,101,107,127,141,183],"diverse":[2,139],"benchmarks":[3],"have":[4],"been":[5],"developed":[6],"to":[7,51,214,221],"measure":[8],"the":[9,39,53,65,79,93,124,210],"quality":[10,54,66],"large":[12],"language":[13],"models":[14,109],"(LLMs).":[15],"Yet":[16],"prior":[17],"work":[18],"has":[19],"demonstrated":[20],"that":[21,64,150,199],"LLM":[22],"performance":[23,83,135,171],"is":[24],"often":[25],"sufficiently":[26],"explained":[27],"by":[28,59,165],"a":[29,68,99,133,137,152,178],"small":[30],"set":[31],"latent":[33],"factors,":[34],"or":[35],"abilities.":[36],"This":[37,121],"suggests":[38],"potential":[40],"for":[41],"more":[42],"efficient":[43],"and":[44,187],"principled":[45],"benchmarking,":[46],"but":[47],"it":[48,77],"remains":[49],"difficult":[50],"compare":[52],"different":[56,129,145],"methods.":[57],"Motivated":[58],"predictive":[60],"validity,":[61],"we":[62,91],"argue":[63],"benchmarking":[69],"framework":[70],"should":[71],"be":[72],"grounded":[73],"in":[74,227],"how":[75,128],"efficiently":[76],"enables":[78,123],"prediction":[80],"model":[82,159],"on":[84,110,136,172],"unseen":[85,142],"tasks.":[86],"To":[87],"analyze":[88],"this":[89],"objective,":[90],"collect":[92],"\"Wide-scale":[94],"Item":[95],"Level":[96],"Dataset\"":[97],"(WILD),":[98],"dataset":[100,122],"item-model":[102],"response":[103,156],"pairs,":[104],"comprising":[105],"evaluations":[106],"65":[108],"109,564":[111],"unique":[112],"items":[113],"spanning":[114],"163":[115],"tasks":[116,143,176],"drawn":[117],"from":[118,218],"27":[119],"datasets.":[120],"first":[125],"analysis":[126],"techniques":[130],"can":[131,169,188,208],"predict":[132,170],"model's":[134],"large,":[138],"collection":[140],"under":[144],"budget":[146],"constraints.":[147],"We":[148,196],"demonstrate":[149,198],"combining":[151],"modified":[153],"multidimensional":[154],"item":[155,162],"theory":[157],"(IRT)":[158],"with":[160,177],"adaptive":[161],"selection":[163,206],"driven":[164],"optimal":[166],"experimental":[167],"design":[168],"112":[173],"held-out":[174],"benchmark":[175],"mean":[179],"absolute":[180],"error":[181],"(MAE)":[182],"less":[184],"than":[185],"7%,":[186],"do":[189],"so":[190],"after":[191],"observing":[192],"only":[193,222],"16":[194],"items.":[195],"further":[197],"incorporating":[200],"cost-aware":[201],"discount":[202],"factors":[203],"into":[204],"our":[205],"criteria":[207],"reduce":[209],"total":[211],"tokens":[212,220],"needed":[213],"reach":[215],"7%":[216],"MAE":[217],"141,000":[219],"22,000,":[223],"an":[224],"85%":[225],"reduction":[226],"evaluation":[228],"cost.":[229]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-04T00:00:00"}
