{"id":"https://openalex.org/W6892556961","doi":"https://doi.org/10.5281/zenodo.10497442","title":"CommitBench","display_name":"CommitBench","publication_year":2023,"publication_date":"2023-12-15","ids":{"openalex":"https://openalex.org/W6892556961","doi":"https://doi.org/10.5281/zenodo.10497442"},"language":"en","primary_location":{"id":"doi:10.5281/zenodo.10497442","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.10497442","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"dataset"},"type":"dataset","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.10497442","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Schall, Maximilian","orcid":"https://orcid.org/0000-0002-3943-3423"},"institutions":[{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Schall, Maximilian","raw_affiliation_strings":["Hasso Plattner Institute"],"affiliations":[{"raw_affiliation_string":"Hasso Plattner Institute","institution_ids":["https://openalex.org/I143288331"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Czinczoll, Tamara","orcid":"https://orcid.org/0009-0007-8224-1893"},"institutions":[{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Czinczoll, Tamara","raw_affiliation_strings":["Hasso Plattner Institute"],"affiliations":[{"raw_affiliation_string":"Hasso Plattner Institute","institution_ids":["https://openalex.org/I143288331"]}]},{"author_position":"last","author":{"id":null,"display_name":"de Melo, Gerard","orcid":"https://orcid.org/0000-0002-2930-2059"},"institutions":[{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"de Melo, Gerard","raw_affiliation_strings":["Hasso Plattner Institute"],"affiliations":[{"raw_affiliation_string":"Hasso Plattner Institute","institution_ids":["https://openalex.org/I143288331"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I143288331"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":null,"topics":[],"keywords":[{"id":"https://openalex.org/keywords/commit","display_name":"Commit","score":0.871999979019165},{"id":"https://openalex.org/keywords/statement","display_name":"Statement (logic)","score":0.6100999712944031},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.557200014591217},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.5022000074386597},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.45089998841285706},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.4359999895095825},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.4068000018596649},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4027000069618225}],"concepts":[{"id":"https://openalex.org/C153180980","wikidata":"https://www.wikidata.org/wiki/Q19776675","display_name":"Commit","level":2,"score":0.871999979019165},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8245999813079834},{"id":"https://openalex.org/C2777026412","wikidata":"https://www.wikidata.org/wiki/Q2684591","display_name":"Statement (logic)","level":2,"score":0.6100999712944031},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.557200014591217},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.5022000074386597},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.45089998841285706},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.4359999895095825},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4081000089645386},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4068000018596649},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4027000069618225},{"id":"https://openalex.org/C2777561058","wikidata":"https://www.wikidata.org/wiki/Q2652119","display_name":"Program comprehension","level":4,"score":0.37929999828338623},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3695000112056732},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.3138999938964844},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3061999976634979},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3034999966621399},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3034999966621399},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.29989999532699585},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C56288433","wikidata":"https://www.wikidata.org/wiki/Q58673","display_name":"Data manipulation language","level":2,"score":0.25189998745918274},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.10497442","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.10497442","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"dataset"}],"best_oa_location":{"id":"doi:10.5281/zenodo.10497442","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.10497442","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"dataset"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Data":[0,24,32,921],"Statement":[1,25,33],"for":[2,55,130,354,375,516,567,650,773,919],"CommitBench":[3,7,45],"-":[4,8,18,23,31,37],"Dataset":[5,9,19,809],"Title:":[6],"Curator:":[10],"Maximilian":[11,27],"Schall,":[12,28],"Tamara":[13,29],"Czinczoll,":[14],"Gerard":[15],"de":[16],"Melo":[17],"Version:":[20,34],"1.0,":[21,35],"15.12.2023":[22],"Author:":[26],"Czinczoll":[30],"16.01.2023":[36],"Code":[38],"URL:":[39],"https://github.com/maxscha/commitbench":[40],"EXECUTIVE":[41],"SUMMARY":[42],"We":[43,71,145,187,272,571],"provide":[44,72,124,238,273,574],"as":[46,93,138,264,701,703,766,768],"an":[47],"open-source,":[48],"reproducible":[49,602],"and":[50,52,81,152,213,348,451,491,508,620,780,841,882,898,936,939,945],"privacy-":[51],"license-aware":[53],"benchmark":[54],"commit":[56,84,157,179,321,523,530,539,708,718],"message":[57,158,180],"generation.":[58],"The":[59,83,104,260,488,926],"dataset":[60,105,148,261,599,628,689,875],"is":[61,95,174,262,311,416,513,544,600,690,798,870],"gathered":[62],"from":[63,305,334,592,948],"github":[64],"repositories":[65],"with":[66,127,133,142,155,622,749,759],"licenses":[67],"that":[68,109,194,325,342,349,430,612,636,660,670,715,750,839,860,876],"permit":[69],"redistribution.":[70],"six":[73],"programming":[74,229,242,377,695,752],"languages,":[75,378,723],"Java,":[76],"Python,":[77],"Go,":[78],"JavaScript,":[79],"PHP":[80,396],"Ruby.":[82],"messages":[85,524,531,719],"in":[86,99,184,248,558,626,654,682,720,801,904],"natural":[87,185],"language":[88,98],"are":[89,223,432,609,637,643,698],"restricted":[90,225],"to":[91,150,175,208,217,226,234,237,244,252,332,401,438,448,457,473,518,546,580,645,786,789,847,852,879,884],"English,":[92],"it":[94,436,512,584,784],"the":[96,172,177,210,218,227,249,253,274,308,335,355,371,402,411,420,439,443,458,474,478,496,499,520,533,538,559,575,593,604,627,655,683,691,726,764,770,776,790,802,811,844,855,908,917,920,949],"working":[97],"many":[100,699],"software":[101,303,412,445,624,893],"development":[102,413],"projects.":[103],"has":[106,452,731],"1,664,590":[107],"examples":[108,663],"were":[110,482,664],"generated":[111],"by":[112,498,930,955],"using":[113,843],"extensive":[114,672],"quality-focused":[115],"filtering":[116,206,543,673],"techniques":[117,207],"(e.g.":[118],"excluding":[119],"bot":[120],"commits).":[121],"Additionally,":[122],"we":[123,222,657,667,762,821],"a":[125,140,162,196,202,258,265,504,623,647,871,874],"version":[126,141],"longer":[128],"sequences":[129],"benchmarking":[131],"models":[132],"more":[134,505],"extended":[135],"sequence":[136],"input,":[137],"well":[139,702],"CURATION":[143],"RATIONALE":[144],"created":[146],"this":[147,362,426,733,796],"due":[149,251,579],"quality":[151,212,649],"legal":[153],"issues":[154],"previous":[156,197],"generation":[159,476],"datasets.":[160],"Given":[161],"git":[163],"diff":[164],"displaying":[165],"code":[166],"changes":[167,183,247],"between":[168],"two":[169],"file":[170,268],"versions,":[171],"task":[173,250],"predict":[176],"accompanying":[178],"describing":[181],"these":[182,366,548],"language.":[186,259,338,510],"base":[188],"our":[189,542,598,641,671,688,704,829],"GitHub":[190,301,462,501,617],"repository":[191,220,804],"selection":[192,288],"on":[193,290,361,706,805,907,916],"of":[195,205,241,255,257,316,373,384,410,461,477,495,522,537,550,606,687,694,751,834,857,873],"dataset,":[198,479,656],"CodeSearchNet,":[199,291],"but":[200,406,583],"apply":[201],"large":[203,266],"number":[204,240,372,693],"improve":[209],"data":[211,363,421,440,497,578,652,765,845,868,911],"eliminate":[214],"noise.":[215],"Due":[216,400,456,472],"original":[219,500,791,803],"selection,":[221],"also":[224,340,407,528],"aforementioned":[228],"languages.":[230,753],"It":[231,339],"was":[232,913,928,946],"important":[233],"us,":[235],"however,":[236],"some":[239,555,713,744],"languages":[243,696],"accommodate":[245],"any":[246],"degree":[254],"hardware-relatedness":[256],"provides":[263,877],"CSV":[267],"containing":[269],"all":[270,306,320,568,676],"samples.":[271],"following":[275],"fields:":[276],"Diff,":[277],"Commit":[278],"Message,":[279],"Hash,":[280],"Project,":[281],"Split.":[282],"DOCUMENTATION":[283],"FOR":[284],"SOURCE":[285],"DATASETS":[286],"Repository":[287],"based":[289,915],"which":[292],"can":[293,327,527,553,585,633,861,940],"be":[294,328,449,468,529,554,586,634,712,757,862,895,902,941],"found":[295,942],"under":[296,810],"https://github.com/github/CodeSearchNet":[297],"LANGUAGE":[298],"VARIETIES":[299],"Since":[300],"hosts":[302],"projects":[304],"over":[307],"world,":[309],"there":[310,415,431,526,552,608,632],"no":[312,417,433,480],"single":[313,418],"uniform":[314],"variety":[315],"English":[317,707,740],"used":[318,353],"across":[319],"messages.":[322,709],"This":[323,837,910],"means":[324,341,838],"phrasing":[326],"regional":[329],"or":[330,535,590,734],"subject":[331],"influences":[333],"programmer's":[336],"native":[337],"different":[343,350,376],"spelling":[344],"conventions":[345],"may":[346,352],"co-exist":[347],"terms":[351],"same":[356],"concept.":[357],"Any":[358],"model":[359],"trained":[360],"should":[364],"take":[365],"factors":[367],"into":[368,825],"account.":[369],"For":[370],"samples":[374],"see":[379],"Table":[380],"below:":[381],"Language":[382],"Number":[383],"Samples":[385],"Java":[386],"153,119":[387],"Ruby":[388],"233,710":[389],"Go":[390],"137,998":[391],"JavaScript":[392],"373,598":[393],"Python":[394],"472,469":[395],"294,394":[397],"SPEAKER":[398],"DEMOGRAPHIC":[399,471],"extremely":[403],"diverse":[404],"(geographically,":[405],"socio-economically)":[408],"backgrounds":[409],"community,":[414],"demographic":[419],"comes":[422,437],"from.":[423],"Of":[424],"course,":[425],"does":[427],"not":[428,514,573,738],"entail":[429],"biases":[434,900],"when":[435],"origin.":[441],"Globally,":[442],"average":[444],"developer":[446],"tends":[447],"male":[450],"obtained":[453,587],"higher":[454],"education.":[455],"anonymous":[459],"nature":[460,490],"profiles,":[463],"gender":[464],"distribution":[465],"information":[466,772,797,859],"cannot":[467,658,668,756,831],"extracted.":[469],"ANNOTATOR":[470],"automated":[475],"annotators":[481],"used.":[483],"SPEECH":[484],"SITUATION":[485],"AND":[486,562,817],"CHARACTERISTICS":[487],"public":[489],"often":[492],"business-related":[493],"creation":[494],"users":[502,883],"fosters":[503],"neutral,":[506],"information-focused":[507],"formal":[509],"As":[511],"uncommon":[515],"developers":[517,881],"find":[519,832],"writing":[521],"tedious,":[525],"representing":[532],"frustration":[534],"boredom":[536],"author.":[540],"While":[541,597,640,820],"supposed":[545],"catch":[547,675],"types":[549],"messages,":[551],"instances":[556,635],"still":[557],"dataset.":[560,684],"PREPROCESSING":[561],"DATA":[563],"FORMATTING":[564],"See":[565],"paper":[566],"preprocessing":[569],"steps.":[570],"do":[572,737],"un-processed":[576],"raw":[577],"privacy":[581],"concerns,":[582],"via":[588],"CodeSearchNet":[589],"requested":[591],"authors.":[594],"CAPTURE":[595],"QUALITY":[596],"completely":[601],"at":[603,730,943],"time":[605],"writing,":[607],"external":[610],"dependencies":[611],"could":[613],"restrict":[614],"this.":[615],"If":[616],"shuts":[618],"down":[619],"someone":[621],"project":[625,778,781],"deletes":[629],"their":[630,721,849],"repository,":[631],"non-reproducible.":[638],"LIMITATIONS":[639],"filters":[642],"meant":[644],"ensure":[646,659],"high":[648],"each":[651],"sample":[653],"only":[661,716],"low-quality":[662,677],"removed.":[665],"Similarly,":[666],"guarantee":[669],"methods":[674],"examples.":[678],"Some":[679],"might":[680,711,890,894,901],"remain":[681],"Another":[685],"limitation":[686],"low":[692],"(there":[697],"more)":[700],"focus":[705],"There":[710],"people":[714],"write":[717],"respective":[722],"e.g.,":[724],"because":[725,735],"organization":[727],"they":[728,736],"work":[729],"established":[732],"speak":[739],"(confidently":[741],"enough).":[742],"Perhaps":[743],"languages'":[745],"syntax":[746],"better":[747,885],"aligns":[748],"These":[754],"effects":[755],"investigated":[758],"CommitBench.":[760],"Although":[761],"anonymize":[763],"far":[767],"possible,":[769],"required":[771],"reproducibility,":[774],"including":[775],"organization,":[777],"name,":[779],"hash,":[782],"makes":[783],"possible":[785],"refer":[787],"back":[788],"authoring":[792],"user":[793],"account,":[794],"since":[795],"freely":[799],"available":[800],"GitHub.":[806],"METADATA":[807],"License:":[808],"CC":[812],"BY-NC":[813],"4.0":[814],"license":[815],"DISCLOSURES":[816],"ETHICAL":[818],"REVIEW":[819],"put":[822],"substantial":[823],"effort":[824],"removing":[826],"privacy-sensitive":[827],"information,":[828],"solutions":[830],"100%":[833],"such":[835],"cases.":[836],"researchers":[840],"anyone":[842],"need":[846],"incorporate":[848],"own":[850],"safeguards":[851],"effectively":[853],"reduce":[854],"amount":[856],"personal":[858],"exposed.":[863],"ABOUT":[864],"THIS":[865],"DOCUMENT":[866],"A":[867],"statement":[869,912],"characterization":[872],"context":[878],"allow":[880],"understand":[886],"how":[887,892],"experimental":[888],"results":[889],"generalize,":[891],"appropriately":[896],"deployed,":[897],"what":[899],"reflected":[903],"systems":[905],"built":[906],"software.":[909],"written":[914],"template":[918,927,954],"Statements":[922],"Version":[923,951],"2":[924],"schema.":[925],"prepared":[929],"Angelina":[931],"McMillan-Major,":[932],"Emily":[933],"M.":[934],"Bender,":[935],"Batya":[937],"Friedman":[938],"https://techpolicylab.uw.edu/data-statements/":[944],"updated":[947],"community":[950],"1":[952],"Markdown":[953],"Leon":[956],"Dercyznski.":[957]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
