{"id":"https://openalex.org/W6912131496","doi":"https://doi.org/10.5281/zenodo.14877503","title":"Generating Sample-Based Musical Instruments Using Neural Audio Codec Language Models","display_name":"Generating Sample-Based Musical Instruments Using Neural Audio Codec Language Models","publication_year":2024,"publication_date":"2024-11-10","ids":{"openalex":"https://openalex.org/W6912131496","doi":"https://doi.org/10.5281/zenodo.14877503"},"language":"en","primary_location":{"id":"doi:10.5281/zenodo.14877503","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.14877503","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Shahan Nercessian","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shahan Nercessian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Johannes Imort","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Johannes Imort","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ninon Devis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ninon Devis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Frederik Blang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Frederik Blang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35886002,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.8858000040054321,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.8858000040054321,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.040800001472234726,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.012600000016391277,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.5778999924659729},{"id":"https://openalex.org/keywords/active-listening","display_name":"Active listening","score":0.5595999956130981},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5095000267028809},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4526999890804291},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.35839998722076416},{"id":"https://openalex.org/keywords/sound-quality","display_name":"Sound quality","score":0.32760000228881836},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.3230000138282776},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3197999894618988}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7401999831199646},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5907999873161316},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.5778999924659729},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.5595999956130981},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5095000267028809},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4526999890804291},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38909998536109924},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38769999146461487},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.35839998722076416},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.32760000228881836},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.3230000138282776},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.29820001125335693},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.2904999852180481},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.2858000099658966},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.27570000290870667},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.2653000056743622},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.26499998569488525},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.263700008392334},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.2590999901294708},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.2563000023365021}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.14877503","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.14877503","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5715572834014893}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"this":[1,66,132],"paper,":[2],"we":[3,68,95],"propose":[4],"and":[5,46,81,110,148],"investigate":[6],"the":[7,16,57,103,107,112,120,143,152],"use":[8],"of":[9,19,106,145],"neural":[10],"audio":[11,28,35],"codec":[12],"language":[13],"models":[14],"for":[15,119,130],"automatic":[17],"generation":[18],"sample-based":[20],"musical":[21,92],"instruments":[22,59,109],"based":[23],"on":[24,39],"text":[25],"or":[26],"reference":[27],"prompts.":[29],"Our":[30,134],"approach":[31,88],"extends":[32],"a":[33,47,61,97,137],"generative":[34],"framework":[36],"to":[37,101,151],"condition":[38],"pitch":[40],"across":[41],"an":[42],"88-key":[43],"spectrum,":[44],"velocity,":[45],"combined":[48],"text/audio":[49],"embedding.":[50],"We":[51,74],"identify":[52],"maintaining":[53],"timbral":[54,104,141],"consistency":[55,105],"within":[56],"generated":[58,108,146],"as":[60],"major":[62],"challenge.":[63],"To":[64],"tackle":[65],"issue,":[67],"introduce":[69,96],"three":[70],"distinct":[71],"conditioning":[72],"schemes.":[73],"analyze":[75],"our":[76,87],"methods":[77],"through":[78],"objective":[79,99],"metrics":[80],"human":[82],"listening":[83],"tests,":[84],"demonstrating":[85],"that":[86,124],"can":[89],"produce":[90],"compelling":[91],"instruments.":[93],"Specifically,":[94],"new":[98],"metric":[100],"evaluate":[102],"adapt":[111],"average":[113],"Contrastive":[114],"Language-Audio":[115],"Pretraining":[116],"(CLAP)":[117],"score":[118],"text-to-instrument":[121],"case,":[122],"noting":[123],"its":[125],"naive":[126],"application":[127],"is":[128],"unsuitable":[129],"assessing":[131],"task.":[133],"findings":[135],"reveal":[136],"complex":[138],"interplay":[139],"between":[140],"consistency,":[142],"quality":[144],"samples,":[147],"their":[149],"correspondence":[150],"input":[153],"prompt.":[154]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
