{"id":"https://openalex.org/W4416268504","doi":"https://doi.org/10.48550/arxiv.2511.04703","title":"Measuring what Matters: Construct Validity in Large Language Model Benchmarks","display_name":"Measuring what Matters: Construct Validity in Large Language Model Benchmarks","publication_year":2025,"publication_date":"2025-11-03","ids":{"openalex":"https://openalex.org/W4416268504","doi":"https://doi.org/10.48550/arxiv.2511.04703"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2511.04703","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.04703","pdf_url":"https://arxiv.org/pdf/2511.04703","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.04703","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001652638","display_name":"Andrew M. Bean","orcid":"https://orcid.org/0000-0001-8439-5975"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bean, Andrew M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Kearns, Ryan Othniel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kearns, Ryan Othniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113161539","display_name":"Angelika Romanou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Romanou, Angelika","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051117078","display_name":"Franziska Sofia Hafner","orcid":"https://orcid.org/0000-0003-1070-8267"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hafner, Franziska Sofia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094087135","display_name":"Harry Mayne","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mayne, Harry","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117865366","display_name":"Jan Batzner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Batzner, Jan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041639724","display_name":"Negar Foroutan","orcid":"https://orcid.org/0000-0002-9640-897X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foroutan, Negar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120451775","display_name":"Chris Schmitz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schmitz, Chris","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093054923","display_name":"Karolina Korgul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Korgul, Karolina","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099496976","display_name":"Hunar Batra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Batra, Hunar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115426715","display_name":"Oishi Deb","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deb, Oishi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120404556","display_name":"Emma Beharry","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Beharry, Emma","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005232344","display_name":"Cornelius Emde","orcid":"https://orcid.org/0000-0003-0348-6109"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Emde, Cornelius","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106793224","display_name":"Thomas A Foster","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foster, Thomas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120355935","display_name":"Anna Gausen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gausen, Anna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010801511","display_name":"Mar\u00eda Grandury","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grandury, Mar\u00eda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112329643","display_name":"Simeng Han","orcid":"https://orcid.org/0000-0001-7238-9201"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Simeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014923355","display_name":"Valentin Hofmann","orcid":"https://orcid.org/0000-0001-6603-3428"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hofmann, Valentin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088109643","display_name":"Lujain Ibrahim","orcid":"https://orcid.org/0000-0002-0395-784X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ibrahim, Lujain","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012164347","display_name":"Hazel Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Hazel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051050719","display_name":"Hannah Rose Kirk","orcid":"https://orcid.org/0000-0002-7419-5993"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kirk, Hannah Rose","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101347509","display_name":"Fangru Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Fangru","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107941641","display_name":"G. Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Gabrielle Kaili-May","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006888957","display_name":"Lennart Luettgau","orcid":"https://orcid.org/0000-0001-9370-9606"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luettgau, Lennart","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099111164","display_name":"Jabez Magomere","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Magomere, Jabez","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119204255","display_name":"Jonathan Rystr\u00f8m","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rystr\u00f8m, Jonathan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111117835","display_name":"Anna Sotnikova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sotnikova, Anna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Yushi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yushi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101240019","display_name":"Yilun Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yilun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091670711","display_name":"Adel Bibi","orcid":"https://orcid.org/0000-0002-6169-3918"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bibi, Adel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088410008","display_name":"Antoine Bosselut","orcid":"https://orcid.org/0000-0001-8968-9649"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bosselut, Antoine","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054998594","display_name":"Ronald Clark","orcid":"https://orcid.org/0000-0002-6344-5299"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clark, Ronald","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064858748","display_name":"Arman Cohan","orcid":"https://orcid.org/0000-0002-8954-2724"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cohan, Arman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059094093","display_name":"Jakob Foerster","orcid":"https://orcid.org/0000-0001-9688-2498"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foerster, Jakob","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029186201","display_name":"Yarin Gal","orcid":"https://orcid.org/0000-0002-2733-2078"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gal, Yarin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029882049","display_name":"Scott A. Hale","orcid":"https://orcid.org/0000-0002-6894-4951"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hale, Scott A.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011015504","display_name":"Inioluwa Deborah Raji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raji, Inioluwa Deborah","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031878516","display_name":"Christopher Summerfield","orcid":"https://orcid.org/0000-0002-2941-2653"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Summerfield, Christopher","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042899882","display_name":"Philip H. S. Torr","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Torr, Philip H. S.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120651795","display_name":"Cozmin Ududec","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ududec, Cozmin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Rocher, Luc","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rocher, Luc","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5055914149","display_name":"Adam Mahdi","orcid":"https://orcid.org/0000-0002-2329-4457"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mahdi, Adam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":42,"corresponding_author_ids":["https://openalex.org/A5001652638"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24199999868869781,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.24199999868869781,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.22460000216960907,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.07419999688863754,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.7091000080108643},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.536899983882904},{"id":"https://openalex.org/keywords/construct-validity","display_name":"Construct validity","score":0.49140000343322754},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4339999854564667},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.40049999952316284},{"id":"https://openalex.org/keywords/model-validation","display_name":"Model validation","score":0.3824000060558319}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.715399980545044},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.7091000080108643},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.536899983882904},{"id":"https://openalex.org/C49453240","wikidata":"https://www.wikidata.org/wiki/Q1592163","display_name":"Construct validity","level":3,"score":0.49140000343322754},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46959999203681946},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.44909998774528503},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4339999854564667},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.40049999952316284},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39899998903274536},{"id":"https://openalex.org/C3019813237","wikidata":"https://www.wikidata.org/wiki/Q65089264","display_name":"Model validation","level":2,"score":0.3824000060558319},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.33219999074935913},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.31769999861717224},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3124000132083893},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3100999891757965},{"id":"https://openalex.org/C174106493","wikidata":"https://www.wikidata.org/wiki/Q1057880","display_name":"External validity","level":2,"score":0.2671000063419342},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25619998574256897}],"mesh":[],"locations_count":4,"locations":[{"id":"pmh:oai:arXiv.org:2511.04703","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.04703","pdf_url":"https://arxiv.org/pdf/2511.04703","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:kobv.de-opus4-hsog:5990","is_oa":false,"landing_page_url":"https://opus4.kobv.de/opus4-hsog/frontdoor/index/index/docId/5990","pdf_url":null,"source":{"id":"https://openalex.org/S4306402087","display_name":"OPUS 4 (Zuse Institute Berlin)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I195893171","host_organization_name":"Zuse Institute Berlin","host_organization_lineage":["https://openalex.org/I195893171"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"doc-type:workingPaper"},{"id":"pmh:oai:ora.ox.ac.uk:uuid:ad2b69b6-0986-42d0-a512-a6e56338b6cc","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306402636","display_name":"Oxford University Research Archive (ORA) (University of Oxford)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I40120149","host_organization_name":"University of Oxford","host_organization_lineage":["https://openalex.org/I40120149"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Symplectic Elements","raw_type":"http://purl.org/coar/resource_type/c_5794"},{"id":"doi:10.48550/arxiv.2511.04703","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.04703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.04703","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.04703","pdf_url":"https://arxiv.org/pdf/2511.04703","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Evaluating":[0],"large":[1],"language":[2,68],"models":[3],"(LLMs)":[4],"is":[5],"crucial":[6],"for":[7],"both":[8],"assessing":[9],"their":[10],"capabilities":[11],"and":[12,24,30,70,86,106,112],"identifying":[13],"safety":[14],"or":[15],"robustness":[16],"issues":[17],"prior":[18],"to":[19,44,81,110],"deployment.":[20],"Reliably":[21],"measuring":[22],"abstract":[23],"complex":[25],"phenomena":[26],"such":[27],"as":[28],"'safety'":[29],"'robustness'":[31],"requires":[32],"strong":[33],"construct":[34],"validity,":[35],"that":[36,40],"is,":[37],"having":[38],"measures":[39],"represent":[41],"what":[42],"matters":[43],"the":[45,74,82,91,94],"phenomenon.":[46],"With":[47],"a":[48,56],"team":[49],"of":[50,59,93],"29":[51],"expert":[52],"reviewers,":[53],"we":[54,77,101],"conduct":[55],"systematic":[57],"review":[58],"445":[60],"LLM":[61,116],"benchmarks":[62],"from":[63],"leading":[64],"conferences":[65],"in":[66,114],"natural":[67],"processing":[69],"machine":[71],"learning.":[72],"Across":[73],"reviewed":[75],"articles,":[76],"find":[78],"patterns":[79],"related":[80],"measured":[83],"phenomena,":[84],"tasks,":[85],"scoring":[87],"metrics":[88],"which":[89],"undermine":[90],"validity":[92],"resulting":[95],"claims.":[96],"To":[97],"address":[98],"these":[99],"shortcomings,":[100],"provide":[102],"eight":[103],"key":[104],"recommendations":[105],"detailed":[107],"actionable":[108],"guidance":[109],"researchers":[111],"practitioners":[113],"developing":[115],"benchmarks.":[117]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2025-11-11T00:00:00"}
