{"id":"https://openalex.org/W7125166899","doi":"https://doi.org/10.48550/arxiv.2601.11895","title":"DevBench: A Realistic, Developer-Informed Benchmark for Code Generation Models","display_name":"DevBench: A Realistic, Developer-Informed Benchmark for Code Generation Models","publication_year":2026,"publication_date":"2026-01-17","ids":{"openalex":"https://openalex.org/W7125166899","doi":"https://doi.org/10.48550/arxiv.2601.11895"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.11895","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041554597","display_name":"Pareesa Ameneh Golnari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kumarappan, Adarsh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123537391","display_name":"Adarsh Kumarappan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Golnari, Pareesa Ameneh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123525076","display_name":"Wen Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Wen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123519604","display_name":"Xiaoyu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Xiaoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002472436","display_name":"Gabriel Ryan","orcid":"https://orcid.org/0009-0003-9464-587X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ryan, Gabriel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123538284","display_name":"Yuting Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yuting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101020667","display_name":"Shengyu Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Shengyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123481271","display_name":"Elsie Nallipogu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nallipogu, Elsie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.4943000078201294,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.4943000078201294,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.05009999871253967,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.042500000447034836,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8317000269889832},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.574999988079071},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5616999864578247},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5291000008583069},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.43070000410079956},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3610999882221222},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.33820000290870667},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3361000120639801}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8317000269889832},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8180999755859375},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.574999988079071},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5616999864578247},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5291000008583069},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4693000018596649},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.445499986410141},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.43070000410079956},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42989999055862427},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3684000074863434},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3610999882221222},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.32429999113082886},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.31679999828338623},{"id":"https://openalex.org/C9357733","wikidata":"https://www.wikidata.org/wiki/Q6878417","display_name":"Missing data","level":2,"score":0.3151000142097473},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C3019813237","wikidata":"https://www.wikidata.org/wiki/Q65089264","display_name":"Model validation","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C110332635","wikidata":"https://www.wikidata.org/wiki/Q629498","display_name":"Genetic programming","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.11895","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.11895","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.11895","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.11895","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"DevBench":[0],"is":[1,120,127],"a":[2],"telemetry-driven":[3],"benchmark":[4,93,108],"designed":[5],"to":[6,44,112],"evaluate":[7],"Large":[8],"Language":[9],"Models":[10],"(LLMs)":[11],"on":[12,74],"realistic":[13],"code":[14],"completion":[15],"tasks.":[16],"It":[17],"includes":[18],"1,800":[19],"evaluation":[20,64],"instances":[21],"across":[22],"six":[23,27],"programming":[24],"languages":[25],"and":[26,35,59,70,76,96,104,116,133],"task":[28],"categories":[29],"derived":[30],"from":[31,40,123],"real":[32],"developer":[33],"telemetry":[34],"synthesized":[36],"using":[37],"generator":[38],"models":[39,81],"multiple":[41],"provider":[42],"families":[43],"mitigate":[45],"single-source":[46],"bias.":[47],"Unlike":[48],"prior":[49],"benchmarks,":[50],"it":[51],"emphasizes":[52],"ecological":[53],"validity,":[54],"avoids":[55],"training":[56],"data":[57],"contamination,":[58],"enables":[60],"detailed":[61],"diagnostics.":[62],"The":[63],"combines":[65],"functional":[66],"correctness,":[67],"similarity-based":[68],"metrics,":[69],"LLM-judge":[71],"assessments":[72],"focused":[73],"usefulness":[75],"contextual":[77],"relevance.":[78],"9":[79],"state-of-the-art":[80],"were":[82],"assessed,":[83],"with":[84],"the":[85,92],"strongest":[86],"achieving":[87],"only":[88],"43.5%":[89],"Pass@1,":[90],"confirming":[91],"remains":[94],"challenging":[95],"revealing":[97],"differences":[98],"in":[99],"syntactic":[100],"precision,":[101],"semantic":[102],"reasoning,":[103],"practical":[105,131],"utility.":[106],"Our":[107],"provides":[109],"actionable":[110],"insights":[111],"guide":[113],"model":[114,135],"selection":[115],"improvement,":[117],"detail":[118],"that":[119],"often":[121],"missing":[122],"other":[124],"benchmarks":[125],"but":[126],"essential":[128],"for":[129],"both":[130],"deployment":[132],"targeted":[134],"development.":[136]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-01-22T00:00:00"}
