{"id":"https://openalex.org/W4410342726","doi":"https://doi.org/10.1109/tai.2025.3569516","title":"Inadequacies of Large Language Model Benchmarks in the Era of Generative Artificial Intelligence","display_name":"Inadequacies of Large Language Model Benchmarks in the Era of Generative Artificial Intelligence","publication_year":2025,"publication_date":"2025-05-13","ids":{"openalex":"https://openalex.org/W4410342726","doi":"https://doi.org/10.1109/tai.2025.3569516"},"language":"en","primary_location":{"id":"doi:10.1109/tai.2025.3569516","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tai.2025.3569516","pdf_url":null,"source":{"id":"https://openalex.org/S4210169448","display_name":"IEEE Transactions on Artificial Intelligence","issn_l":"2691-4581","issn":["2691-4581"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024454231","display_name":"Timothy R. McIntosh","orcid":"https://orcid.org/0000-0003-0836-4266"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Timothy R. McIntosh","raw_affiliation_strings":["Cyberoo Pty Ltd., Surrey Hills, NSW, Australia","Cyberoo Pty Ltd, Surrey Hills, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"Cyberoo Pty Ltd., Surrey Hills, NSW, Australia","institution_ids":[]},{"raw_affiliation_string":"Cyberoo Pty Ltd, Surrey Hills, NSW, Australia","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037915797","display_name":"Teo Su\u0161njak","orcid":"https://orcid.org/0000-0001-9416-1435"},"institutions":[{"id":"https://openalex.org/I51158804","display_name":"Massey University","ror":"https://ror.org/052czxv31","country_code":"NZ","type":"education","lineage":["https://openalex.org/I51158804"]}],"countries":["NZ"],"is_corresponding":false,"raw_author_name":"Teo Susnjak","raw_affiliation_strings":["Massey University, Auckland, New Zealand"],"affiliations":[{"raw_affiliation_string":"Massey University, Auckland, New Zealand","institution_ids":["https://openalex.org/I51158804"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081069489","display_name":"Nalin Asanka Gamagedara Arachchilage","orcid":"https://orcid.org/0000-0002-0059-0376"},"institutions":[{"id":"https://openalex.org/I82951845","display_name":"RMIT University","ror":"https://ror.org/04ttjf776","country_code":"AU","type":"education","lineage":["https://openalex.org/I82951845"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Nalin Arachchilage","raw_affiliation_strings":["RMIT University, Melbourne, VIC, Australia"],"affiliations":[{"raw_affiliation_string":"RMIT University, Melbourne, VIC, Australia","institution_ids":["https://openalex.org/I82951845"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100392689","display_name":"Tong Liu","orcid":"https://orcid.org/0000-0003-3047-1148"},"institutions":[{"id":"https://openalex.org/I51158804","display_name":"Massey University","ror":"https://ror.org/052czxv31","country_code":"NZ","type":"education","lineage":["https://openalex.org/I51158804"]}],"countries":["NZ"],"is_corresponding":false,"raw_author_name":"Tong Liu","raw_affiliation_strings":["Massey University, Auckland, New Zealand"],"affiliations":[{"raw_affiliation_string":"Massey University, Auckland, New Zealand","institution_ids":["https://openalex.org/I51158804"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052950957","display_name":"Dan Xu","orcid":"https://orcid.org/0009-0004-3930-7381"},"institutions":[{"id":"https://openalex.org/I82951845","display_name":"RMIT University","ror":"https://ror.org/04ttjf776","country_code":"AU","type":"education","lineage":["https://openalex.org/I82951845"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Dan Xu","raw_affiliation_strings":["RMIT University, Melbourne, VIC, Australia"],"affiliations":[{"raw_affiliation_string":"RMIT University, Melbourne, VIC, Australia","institution_ids":["https://openalex.org/I82951845"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074086920","display_name":"Paul Watters","orcid":"https://orcid.org/0000-0002-1399-7175"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Paul Watters","raw_affiliation_strings":["Cyberstronomy Pty Ltd., Ballarat, VIC, Australia","Cyberstronomy Pty Ltd, Ballarat, VIC, Australia"],"affiliations":[{"raw_affiliation_string":"Cyberstronomy Pty Ltd., Ballarat, VIC, Australia","institution_ids":[]},{"raw_affiliation_string":"Cyberstronomy Pty Ltd, Ballarat, VIC, Australia","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053757316","display_name":"Malka N. Halgamuge","orcid":"https://orcid.org/0000-0001-9994-3778"},"institutions":[{"id":"https://openalex.org/I82951845","display_name":"RMIT University","ror":"https://ror.org/04ttjf776","country_code":"AU","type":"education","lineage":["https://openalex.org/I82951845"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Malka N. Halgamuge","raw_affiliation_strings":["RMIT University, Melbourne, VIC, Australia"],"affiliations":[{"raw_affiliation_string":"RMIT University, Melbourne, VIC, Australia","institution_ids":["https://openalex.org/I82951845"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5024454231"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":86.7311,"has_fulltext":false,"cited_by_count":42,"citation_normalized_percentile":{"value":0.99934407,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"7","issue":"1","first_page":"22","last_page":"39"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33799999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33799999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.7708312273025513},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5158947706222534},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4896472096443176},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4414966404438019},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.4288254380226135},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3870023488998413},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3581554889678955},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.23409581184387207},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.12364831566810608}],"concepts":[{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.7708312273025513},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5158947706222534},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4896472096443176},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4414966404438019},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.4288254380226135},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3870023488998413},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3581554889678955},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.23409581184387207},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.12364831566810608}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tai.2025.3569516","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tai.2025.3569516","pdf_url":null,"source":{"id":"https://openalex.org/S4210169448","display_name":"IEEE Transactions on Artificial Intelligence","issn_l":"2691-4581","issn":["2691-4581"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1979264048","https://openalex.org/W2040953484","https://openalex.org/W2763339398","https://openalex.org/W2923014074","https://openalex.org/W2958385005","https://openalex.org/W2963928582","https://openalex.org/W2968091587","https://openalex.org/W2998903229","https://openalex.org/W3048498988","https://openalex.org/W3148880536","https://openalex.org/W3179862281","https://openalex.org/W3205163562","https://openalex.org/W4206578767","https://openalex.org/W4286307885","https://openalex.org/W4309674289","https://openalex.org/W4366548330","https://openalex.org/W4382246105","https://openalex.org/W4384071683","https://openalex.org/W4385572697","https://openalex.org/W4386607580","https://openalex.org/W4388691793","https://openalex.org/W4388720055","https://openalex.org/W4388886073","https://openalex.org/W4389518608","https://openalex.org/W4389518784","https://openalex.org/W4391136507","https://openalex.org/W4392223637","https://openalex.org/W4392908117","https://openalex.org/W4393147065","https://openalex.org/W4393147120","https://openalex.org/W4393277515","https://openalex.org/W4396608701","https://openalex.org/W4399285823","https://openalex.org/W4399365040","https://openalex.org/W4400530529","https://openalex.org/W4401042689","https://openalex.org/W4401863475","https://openalex.org/W4402670899","https://openalex.org/W4404783604","https://openalex.org/W4406984555","https://openalex.org/W4411630291"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4391584540","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W4395044357","https://openalex.org/W4287117424","https://openalex.org/W4387506531","https://openalex.org/W2087346071","https://openalex.org/W2967848559"],"abstract_inverted_index":{"The":[0],"rapid":[1],"rise":[2],"in":[3,35,80,100,118,155],"popularity":[4],"of":[5,59,67,95,120,162,168,175],"Large":[6],"Language":[7],"Models":[8],"(LLMs)":[9],"with":[10],"emerging":[11],"capabilities":[12],"has":[13],"spurred":[14],"public":[15],"curiosity":[16],"to":[17,26,43,133,137],"evaluate":[18],"and":[19,62,70,92,97,115,143,172],"compare":[20],"different":[21],"LLMs,":[22],"leading":[23],"many":[24],"researchers":[25],"propose":[27],"their":[28],"own":[29],"LLM":[30,48,156],"benchmarks.":[31],"Noticing":[32],"preliminary":[33],"inadequacies":[34],"those":[36],"benchmarks,":[37,49],"we":[38],"embarked":[39],"on":[40],"a":[41,152],"study":[42,147],"critically":[44],"assess":[45],"23":[46],"state-of-the-art":[47],"using":[50],"our":[51],"novel":[52],"unified":[53],"evaluation":[54,157],"framework":[55],"through":[56],"the":[57,65,93,107,149,160,166,173],"lenses":[58],"people,":[60],"process,":[61],"technology,":[63],"under":[64],"pillars":[66],"benchmark":[68],"functionality":[69],"integrity.":[71],"Our":[72,104,146],"research":[73],"uncovered":[74],"significant":[75],"limitations,":[76],"including":[77,125],"biases,":[78],"difficulties":[79],"measuring":[81],"genuine":[82],"reasoning,":[83],"adaptability,":[84],"implementation":[85],"inconsistencies,":[86],"prompt":[87],"engineering":[88],"complexity,":[89],"evaluator":[90],"diversity,":[91],"overlooking":[94],"cultural":[96],"ideological":[98],"norms":[99],"one":[101],"comprehensive":[102],"assessment.":[103],"discussions":[105],"emphasized":[106],"urgent":[108],"need":[109],"for":[110,127,151,165],"standardized":[111],"methodologies,":[112,158],"regulatory":[113],"certainties,":[114],"ethical":[116],"guidelines":[117],"light":[119],"Artificial":[121],"Intelligence":[122],"(AI)":[123],"advancements,":[124],"advocating":[126],"an":[128],"evolution":[129],"from":[130],"static":[131],"benchmarks":[132,171],"dynamic":[134],"behavioral":[135],"profiling":[136],"accurately":[138],"capture":[139],"LLMs\u2019":[140],"complex":[141],"behaviors":[142],"potential":[144],"risks.":[145],"highlighted":[148],"necessity":[150],"paradigm":[153],"shift":[154],"underlining":[159],"importance":[161],"collaborative":[163],"efforts":[164],"development":[167],"universally":[169],"accepted":[170],"enhancement":[174],"AI":[176],"systems\u2019":[177],"integration":[178],"into":[179],"society.":[180]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":26},{"year":2024,"cited_by_count":11}],"updated_date":"2026-03-09T08:58:05.943551","created_date":"2025-10-10T00:00:00"}
