{"id":"https://openalex.org/W7108676090","doi":"https://doi.org/10.5281/zenodo.17811410","title":"The Rhythm In Anything: Audio-Prompted Drums Generation with Masked Language Modeling","display_name":"The Rhythm In Anything: Audio-Prompted Drums Generation with Masked Language Modeling","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108676090","doi":"https://doi.org/10.5281/zenodo.17811410"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811410","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811410","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811410","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Patrick O'Reilly","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patrick O'Reilly","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Julia Barnett","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Julia Barnett","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hugo Flores Garcia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hugo Flores Garcia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Annie Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Annie Chu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Nathan Pruyne","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nathan Pruyne","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Prem Seetharaman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prem Seetharaman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Bryan Pardo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bryan Pardo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.53716593,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.8220999836921692,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.8220999836921692,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.08940000087022781,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.023399999365210533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rhythm","display_name":"Rhythm","score":0.9172000288963318},{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.7828999757766724},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.44190001487731934},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.4318999946117401},{"id":"https://openalex.org/keywords/drum","display_name":"Drum","score":0.38989999890327454},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3345000147819519}],"concepts":[{"id":"https://openalex.org/C135343436","wikidata":"https://www.wikidata.org/wiki/Q170406","display_name":"Rhythm","level":2,"score":0.9172000288963318},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.7828999757766724},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5697000026702881},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5049999952316284},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.46299999952316284},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.44190001487731934},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.4318999946117401},{"id":"https://openalex.org/C175845324","wikidata":"https://www.wikidata.org/wiki/Q2738285","display_name":"Drum","level":2,"score":0.38989999890327454},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3345000147819519},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.31279999017715454},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C2780226923","wikidata":"https://www.wikidata.org/wiki/Q929848","display_name":"Movement (music)","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C552089266","wikidata":"https://www.wikidata.org/wiki/Q494510","display_name":"Voice","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.25839999318122864}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811410","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811410","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811410","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811410","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Musicians":[0],"and":[1,11,58,93,119],"nonmusicians":[2],"alike":[3],"use":[4],"rhythmic":[5,19,77,91],"sound":[6,78,143],"gestures,":[7],"such":[8],"as":[9,22,50],"tapping":[10],"beatboxing,":[12],"to":[13,80,97],"express":[14],"drum":[15,52,82,135],"patterns.":[16],"Often,":[17],"these":[18,37,48],"gestures":[20,38,79,144],"function":[21],"sketches":[23],"of":[24,88,104,133,142,149],"more":[25],"complex":[26],"patterns,":[27],"voicing":[28],"key":[29],"elements":[30],"while":[31],"eliding":[32],"or":[33],"implying":[34],"others.":[35],"While":[36],"provide":[39],"an":[40,85],"intuitive":[41],"method":[42],"for":[43,75],"communicating":[44],"musical":[45],"ideas,":[46],"realizing":[47],"ideas":[49],"fully-produced":[51],"recordings":[53],"often":[54],"requires":[55],"significant":[56],"time":[57],"skill.":[59],"To":[60],"bridge":[61],"this":[62],"gap,":[63],"we":[64],"present":[65],"TRIA":[66,101,125],"(The":[67],"Rhythm":[68],"In":[69],"Anything),":[70],"a":[71,94,105,124,146,152],"conditional":[72],"generative":[73],"model":[74,126],"mapping":[76],"high-fidelity":[81],"recordings.":[83],"Given":[84],"audio":[86,103],"prompt":[87,96],"the":[89,108,115],"desired":[90,109,116],"pattern":[92],"second":[95],"represent":[98],"drumkit":[99,106],"timbre,":[100],"produces":[102],"playing":[107],"rhythm":[110],"(with":[111],"appropriate":[112],"elaborations)":[113],"in":[114,151],"timbre.":[117],"Subjective":[118],"objective":[120],"evaluations":[121],"show":[122],"that":[123],"trained":[127],"on":[128],"less":[129],"than":[130],"10":[131],"hours":[132],"publicly-available":[134],"data":[136],"can":[137],"generate":[138],"high-quality,":[139],"faithful":[140],"realizations":[141],"across":[145],"wide":[147],"range":[148],"timbres":[150],"zero-shot":[153],"manner.":[154]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-05T00:00:00"}
