{"id":"https://openalex.org/W4417439098","doi":"https://doi.org/10.1109/tse.2025.3644183","title":"Benchmarking AI Models in Software Engineering: A Review, Search Tool, and Unified Approach for Elevating Benchmark Quality","display_name":"Benchmarking AI Models in Software Engineering: A Review, Search Tool, and Unified Approach for Elevating Benchmark Quality","publication_year":2025,"publication_date":"2025-12-17","ids":{"openalex":"https://openalex.org/W4417439098","doi":"https://doi.org/10.1109/tse.2025.3644183"},"language":null,"primary_location":{"id":"doi:10.1109/tse.2025.3644183","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tse.2025.3644183","pdf_url":null,"source":{"id":"https://openalex.org/S8351582","display_name":"IEEE Transactions on Software Engineering","issn_l":"0098-5589","issn":["0098-5589","1939-3520","2326-3881"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Software Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5115454624","display_name":"Roham Koohestani","orcid":"https://orcid.org/0009-0000-1649-9596"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Roham Koohestani","raw_affiliation_strings":["Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","EEMCS faculty, Delft University of Technology, Netherlands"],"raw_orcid":"https://orcid.org/0009-0000-1649-9596","affiliations":[{"raw_affiliation_string":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"EEMCS faculty, Delft University of Technology, Netherlands","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053983805","display_name":"Philippe de Bekker","orcid":"https://orcid.org/0000-0002-3722-5428"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Philippe de Bekker","raw_affiliation_strings":["Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","EEMCS faculty, Delft University of Technology, Netherlands"],"raw_orcid":"https://orcid.org/0000-0002-3722-5428","affiliations":[{"raw_affiliation_string":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"EEMCS faculty, Delft University of Technology, Netherlands","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118509098","display_name":"Beg\u00fcm Ko\u00e7","orcid":null},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Beg\u00fcm Ko\u00e7","raw_affiliation_strings":["Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","EEMCS faculty, Delft University of Technology, Netherlands"],"raw_orcid":"https://orcid.org/0009-0000-6686-6008","affiliations":[{"raw_affiliation_string":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"EEMCS faculty, Delft University of Technology, Netherlands","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"last","author":{"id":null,"display_name":"Maliheh Izadi","orcid":"https://orcid.org/0000-0001-5093-5523"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Maliheh Izadi","raw_affiliation_strings":["Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","EEMCS faculty, Delft University of Technology, Netherlands"],"raw_orcid":"https://orcid.org/0000-0001-5093-5523","affiliations":[{"raw_affiliation_string":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"EEMCS faculty, Delft University of Technology, Netherlands","institution_ids":["https://openalex.org/I98358874"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5115454624"],"corresponding_institution_ids":["https://openalex.org/I98358874"],"apc_list":null,"apc_paid":null,"fwci":2.9051,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.93971594,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"52","issue":"2","first_page":"651","last_page":"674"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.6942999958992004,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.6942999958992004,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10430","display_name":"Software Engineering Techniques and Practices","score":0.12240000069141388,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.056299999356269836,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.8981000185012817},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8320000171661377},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.552299976348877},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.5026000142097473},{"id":"https://openalex.org/keywords/curse-of-dimensionality","display_name":"Curse of dimensionality","score":0.45739999413490295},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4498000144958496},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.4377000033855438},{"id":"https://openalex.org/keywords/standardization","display_name":"Standardization","score":0.4106000065803528},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.41019999980926514},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.38429999351501465}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.8981000185012817},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8693000078201294},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8320000171661377},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.552299976348877},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.507099986076355},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.5026000142097473},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4950000047683716},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.45739999413490295},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4498000144958496},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.4377000033855438},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4219000041484833},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4117000102996826},{"id":"https://openalex.org/C188087704","wikidata":"https://www.wikidata.org/wiki/Q369577","display_name":"Standardization","level":2,"score":0.4106000065803528},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.41019999980926514},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.38429999351501465},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.3799999952316284},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3646000027656555},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.3434999883174896},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.32739999890327454},{"id":"https://openalex.org/C82214349","wikidata":"https://www.wikidata.org/wiki/Q657339","display_name":"Software metric","level":5,"score":0.3163999915122986},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.3154999911785126},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C529173508","wikidata":"https://www.wikidata.org/wiki/Q638608","display_name":"Software development","level":3,"score":0.2888000011444092},{"id":"https://openalex.org/C137287247","wikidata":"https://www.wikidata.org/wiki/Q1329550","display_name":"Static program analysis","level":4,"score":0.28060001134872437},{"id":"https://openalex.org/C139143892","wikidata":"https://www.wikidata.org/wiki/Q7441615","display_name":"Search-based software engineering","level":5,"score":0.2743000090122223},{"id":"https://openalex.org/C32833848","wikidata":"https://www.wikidata.org/wiki/Q4115054","display_name":"Extensibility","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C151552104","wikidata":"https://www.wikidata.org/wiki/Q7705809","display_name":"Test suite","level":4,"score":0.2705000042915344},{"id":"https://openalex.org/C128942645","wikidata":"https://www.wikidata.org/wiki/Q1568346","display_name":"Test case","level":3,"score":0.2635999917984009},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C509989072","wikidata":"https://www.wikidata.org/wiki/Q15188241","display_name":"Model-driven architecture","level":4,"score":0.26170000433921814},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.25850000977516174}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tse.2025.3644183","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tse.2025.3644183","pdf_url":null,"source":{"id":"https://openalex.org/S8351582","display_name":"IEEE Transactions on Software Engineering","issn_l":"0098-5589","issn":["0098-5589","1939-3520","2326-3881"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Software Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":122,"referenced_works":["https://openalex.org/W841012168","https://openalex.org/W2156723666","https://openalex.org/W2242083635","https://openalex.org/W2762550985","https://openalex.org/W2788306232","https://openalex.org/W2794992746","https://openalex.org/W2807964941","https://openalex.org/W2884276923","https://openalex.org/W2888017562","https://openalex.org/W2890431379","https://openalex.org/W2949297108","https://openalex.org/W2959219726","https://openalex.org/W2962936887","https://openalex.org/W2963477458","https://openalex.org/W2964194820","https://openalex.org/W2964315653","https://openalex.org/W2970442801","https://openalex.org/W2970605129","https://openalex.org/W2982275082","https://openalex.org/W2998870469","https://openalex.org/W3005628256","https://openalex.org/W3091588759","https://openalex.org/W3097329333","https://openalex.org/W3105943882","https://openalex.org/W3163206498","https://openalex.org/W3170978252","https://openalex.org/W3173383004","https://openalex.org/W3173493489","https://openalex.org/W3174145130","https://openalex.org/W3175818566","https://openalex.org/W3183469243","https://openalex.org/W3184995367","https://openalex.org/W3199077625","https://openalex.org/W4220783184","https://openalex.org/W4283799640","https://openalex.org/W4284665614","https://openalex.org/W4284669642","https://openalex.org/W4285600327","https://openalex.org/W4308627645","https://openalex.org/W4308643152","https://openalex.org/W4311887664","https://openalex.org/W4312247631","https://openalex.org/W4383215759","https://openalex.org/W4384026520","https://openalex.org/W4384345646","https://openalex.org/W4384345745","https://openalex.org/W4385565416","https://openalex.org/W4385567003","https://openalex.org/W4385572906","https://openalex.org/W4385574207","https://openalex.org/W4387869001","https://openalex.org/W4387913122","https://openalex.org/W4388212317","https://openalex.org/W4389159862","https://openalex.org/W4389215044","https://openalex.org/W4389518608","https://openalex.org/W4389518960","https://openalex.org/W4389519225","https://openalex.org/W4391558404","https://openalex.org/W4392182010","https://openalex.org/W4392414327","https://openalex.org/W4392425548","https://openalex.org/W4393152795","https://openalex.org/W4394744221","https://openalex.org/W4395704141","https://openalex.org/W4398239265","https://openalex.org/W4399577158","https://openalex.org/W4399668074","https://openalex.org/W4400112643","https://openalex.org/W4400140877","https://openalex.org/W4400582976","https://openalex.org/W4400681670","https://openalex.org/W4401906817","https://openalex.org/W4402442714","https://openalex.org/W4402443006","https://openalex.org/W4402457600","https://openalex.org/W4402665833","https://openalex.org/W4402670434","https://openalex.org/W4402670898","https://openalex.org/W4402671203","https://openalex.org/W4402671827","https://openalex.org/W4402671956","https://openalex.org/W4402683831","https://openalex.org/W4402684113","https://openalex.org/W4402684335","https://openalex.org/W4403210636","https://openalex.org/W4403486628","https://openalex.org/W4403536388","https://openalex.org/W4404697012","https://openalex.org/W4404792894","https://openalex.org/W4407375771","https://openalex.org/W4409362569","https://openalex.org/W4409362653","https://openalex.org/W4410537502","https://openalex.org/W4410537769","https://openalex.org/W4411113098","https://openalex.org/W4411119259","https://openalex.org/W4411119411","https://openalex.org/W4411233264","https://openalex.org/W4411233323","https://openalex.org/W4411271580","https://openalex.org/W4411450081","https://openalex.org/W4411522841","https://openalex.org/W4411523125","https://openalex.org/W4411552165","https://openalex.org/W4411950637","https://openalex.org/W4411950693","https://openalex.org/W4412886956","https://openalex.org/W4412887745","https://openalex.org/W4412888311","https://openalex.org/W4412888419","https://openalex.org/W4412888458","https://openalex.org/W4412888714","https://openalex.org/W4412944815","https://openalex.org/W4412945055","https://openalex.org/W4412945128","https://openalex.org/W4413978227","https://openalex.org/W4414363390","https://openalex.org/W4416035104","https://openalex.org/W4417439109","https://openalex.org/W7125895518","https://openalex.org/W7126448819"],"related_works":[],"abstract_inverted_index":{"Benchmarks":[0],"are":[1],"essential":[2],"for":[3,14,22,74,81,122,217],"unified":[4,79,170],"evaluation":[5],"and":[6,28,58,77,104,151,157,192,203,211,230,249],"reproducibility.":[7],"The":[8],"rapid":[9],"rise":[10],"of":[11,53,90,133,154,160,209,244],"Artificial":[12],"Intelligence":[13],"Software":[15],"Engineering":[16],"(AI4SE)":[17],"has":[18,34],"produced":[19],"numerous":[20],"benchmarks":[21,73,96],"tasks":[23],"such":[24],"as":[25],"code":[26,198],"generation":[27],"bug":[29],"repair.":[30],"However,":[31],"this":[32],"proliferation":[33],"led":[35],"to":[36,172,178],"major":[37],"challenges:":[38],"(1)":[39],"fragmented":[40],"knowledge":[41],"across":[42],"tasks,":[43],"(2)":[44],"difficulty":[45],"in":[46,55,107],"selecting":[47],"contextually":[48],"relevant":[49],"benchmarks,":[50],"(3)":[51],"lack":[52],"standardization":[54],"benchmark":[56,84,174,219],"creation,":[57],"(4)":[59],"flaws":[60],"that":[61],"limit":[62],"utility.":[63],"Addressing":[64],"these":[65,112],"requires":[66],"a":[67,88,141,169],"dual":[68],"approach:":[69],"systematically":[70],"mapping":[71],"existing":[72],"informed":[75],"selection":[76],"defining":[78],"guidelines":[80],"robust,":[82],"adaptable":[83],"development.":[85],"We":[86,99,221],"conduct":[87],"review":[89],"247":[91],"studies,":[92,135],"identifying":[93],"273":[94],"AI4SE":[95],"since":[97],"2014.":[98],"categorize":[100],"them,":[101],"analyze":[102],"limitations,":[103],"expose":[105],"gaps":[106],"current":[108],"practices.":[109],"Building":[110],"on":[111,200,234],"insights,":[113],"we":[114,166,239],"introduce":[115],"BenchScout,":[116],"an":[117,227],"extensible":[118],"semantic":[119],"search":[120],"tool":[121],"locating":[123],"suitable":[124],"benchmarks.":[125],"BenchScout":[126,147],"employs":[127],"automated":[128],"clustering":[129],"with":[130,144],"contextual":[131],"embeddings":[132],"benchmark-related":[134],"followed":[136],"by":[137],"dimensionality":[138],"reduction.":[139],"In":[140],"user":[142,247],"study":[143],"22":[145],"participants,":[146],"achieved":[148],"usability,":[149],"effectiveness,":[150],"intuitiveness":[152],"scores":[153],"4.5,":[155],"4.0,":[156],"4.1":[158],"out":[159],"5.":[161],"To":[162],"improve":[163,173],"benchmarking":[164],"standards,":[165],"propose":[167],"BenchFrame,":[168],"approach":[171],"quality.":[175],"Applying":[176],"Bench-Frame":[177],"HumanEval":[179],"yielded":[180],"HumanEvalNext,":[181],"which":[182],"features":[183],"corrected":[184],"errors,":[185],"improved":[186],"language":[187],"conversion,":[188],"higher":[189],"test":[190],"coverage,":[191],"greater":[193],"difficulty.":[194],"Evaluating":[195],"10":[196],"state-of-the-art":[197],"models":[199],"HumanEval,":[201],"HumanEvalPlus,":[202],"HumanEvalNext":[204],"revealed":[205],"average":[206],"pass-at-1":[207],"drops":[208],"31.22%":[210],"19.94%,":[212],"respectively,":[213],"underscoring":[214],"the":[215,235,242,250],"need":[216],"continuous":[218],"refinement.":[220],"further":[222],"examine":[223],"BenchFrame\u2019s":[224],"scalability":[225],"through":[226],"agentic":[228],"pipeline":[229],"confirm":[231],"its":[232],"generalizability":[233],"MBPP":[236],"dataset.":[237],"Lastly,":[238],"publicly":[240],"release":[241],"material":[243],"our":[245],"review,":[246],"study,":[248],"enhanced":[251],"benchmark.<sup":[252],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[253],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[254]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-02-13T13:36:01.753593","created_date":"2025-12-17T00:00:00"}
