{"id":"https://openalex.org/W7134075290","doi":"https://doi.org/10.48550/arxiv.2603.04459","title":"Benchmark of Benchmarks: Unpacking Influence and Code Repository Quality in LLM Safety Benchmarks","display_name":"Benchmark of Benchmarks: Unpacking Influence and Code Repository Quality in LLM Safety Benchmarks","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7134075290","doi":"https://doi.org/10.48550/arxiv.2603.04459"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.04459","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128249403","display_name":"Junjie Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chu, Junjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102726629","display_name":"Xinyue Shen","orcid":"https://orcid.org/0009-0006-9954-587X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Xinyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128270592","display_name":"Ye Leng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leng, Ye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128227872","display_name":"Michael Backes","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Backes, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128248529","display_name":"Yun Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128229690","display_name":"Yang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5128249403"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.07829999923706055,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.07829999923706055,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.07720000296831131,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11492","display_name":"Academic integrity and plagiarism","score":0.057100001722574234,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/unpacking","display_name":"Unpacking","score":0.9154999852180481},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7764000296592712},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.6735000014305115},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6657000184059143},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.6021999716758728},{"id":"https://openalex.org/keywords/citation","display_name":"Citation","score":0.5529000163078308},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.4047999978065491}],"concepts":[{"id":"https://openalex.org/C2777256151","wikidata":"https://www.wikidata.org/wiki/Q7897273","display_name":"Unpacking","level":2,"score":0.9154999852180481},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7764000296592712},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6740000247955322},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.6735000014305115},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6657000184059143},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.6021999716758728},{"id":"https://openalex.org/C2778805511","wikidata":"https://www.wikidata.org/wiki/Q1713","display_name":"Citation","level":2,"score":0.5529000163078308},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.4772999882698059},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.44110000133514404},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.4047999978065491},{"id":"https://openalex.org/C10272871","wikidata":"https://www.wikidata.org/wiki/Q929972","display_name":"Software inspection","level":5,"score":0.38580000400543213},{"id":"https://openalex.org/C150292731","wikidata":"https://www.wikidata.org/wiki/Q1342704","display_name":"Code review","level":5,"score":0.3725000023841858},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.37139999866485596},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.353300005197525},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.3402000069618225},{"id":"https://openalex.org/C106436119","wikidata":"https://www.wikidata.org/wiki/Q836575","display_name":"Quality assurance","level":3,"score":0.3197000026702881},{"id":"https://openalex.org/C199519371","wikidata":"https://www.wikidata.org/wiki/Q942695","display_name":"Source lines of code","level":3,"score":0.29490000009536743},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.28700000047683716},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.28690001368522644}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.04459","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.04459","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.04459","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.04459","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.5776393413543701,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"rapid":[1],"expansion":[2],"of":[3,31,38,47,62,98],"research":[4],"in":[5,10],"LLM":[6,64],"safety":[7,65,180,185,219],"presents":[8],"challenges":[9],"tracking":[11],"advancements,":[12],"making":[13],"benchmarks":[14,49,66,212],"important":[15],"evaluation":[16],"infrastructures":[17],"for":[18],"identifying":[19],"key":[20],"trends":[21],"and":[22,35,71,90,111,146,159,181,236,249],"facilitating":[23],"systematic":[24,28,59],"comparisons.":[25],"Yet":[26],"no":[27,131],"assessment":[29],"exists":[30],"their":[32],"code":[33,147,152,246],"quality":[34,153],"runnability,":[36,148],"nor":[37],"what":[39],"factors":[40],"are":[41],"associated":[42],"with":[43,73,130,143,151],"the":[44,127,163],"community's":[45,164],"adoption":[46,135,141],"certain":[48],"over":[50],"others.":[51],"To":[52],"address":[53],"this":[54],"gap,":[55],"we":[56,137,177],"conduct":[57],"a":[58,78,112,238],"measurement":[60],"study":[61,128],"31":[63],"(covering":[67],"prompt":[68],"injection,":[69],"jailbreak,":[70],"hallucination)":[72],"382":[74],"non-benchmark":[75],"papers":[76,223],"as":[77,156,193,206],"control":[79],"group,":[80],"combining":[81],"automated":[82],"static":[83],"analysis,":[84],"human":[85],"runnability":[86],"testing":[87],"(220+":[88],"person-hours),":[89],"bibliometric":[91],"analysis.":[92],"We":[93,228],"find":[94,138],"that":[95,139,162],"only":[96,105],"39\\%":[97],"benchmark":[99,140,165,186,243],"repositories":[100,187],"can":[101],"run":[102],"without":[103,197],"modification,":[104],"16\\%":[106],"provide":[107],"flawless":[108],"installation":[109],"guides,":[110],"mere":[113],"6\\%":[114],"include":[115],"ethical":[116,199,250],"considerations":[117],"despite":[118],"containing":[119],"potentially":[120],"harmful":[121,190],"content.":[122],"These":[123],"deficiencies":[124],"persist":[125],"across":[126,221],"period":[129],"significant":[132],"improvement.":[133],"Analyzing":[134],"factors,":[136],"correlates":[142],"author":[144],"prominence":[145],"but":[149],"not":[150,168,225],"standards":[154],"such":[155,192],"Pylint":[157],"score":[158],"maintainability,":[160],"suggesting":[161],"selection":[166],"does":[167],"reward":[169],"higher":[170],"coding":[171],"standards.":[172],"Based":[173],"on":[174],"these":[175,233],"results,":[176],"identify":[178],"potential":[179],"reliability":[182],"concerns.":[183],"Some":[184],"openly":[188],"expose":[189],"content,":[191],"successful":[194],"jailbreak":[195],"responses,":[196],"any":[198],"warning":[200],"or":[201],"access":[202],"control,":[203],"effectively":[204],"serving":[205],"unguarded":[207],"attack":[208],"resources.":[209],"Furthermore,":[210],"when":[211],"require":[213],"ad-hoc":[214],"modifications":[215],"to":[216,241],"run,":[217],"downstream":[218],"evaluations":[220],"different":[222],"may":[224],"be":[226],"comparable.":[227],"present":[229],"case":[230],"studies":[231],"illustrating":[232],"concrete":[234],"consequences":[235],"propose":[237],"targeted":[239],"checklist":[240],"help":[242],"contributors":[244],"improve":[245],"quality,":[247],"documentation,":[248],"practices.":[251]},"counts_by_year":[],"updated_date":"2026-05-19T06:13:58.757530","created_date":"2026-03-07T00:00:00"}
