{"id":"https://openalex.org/W4367359628","doi":"https://doi.org/10.1109/taslp.2023.3268730","title":"Diffsound: Discrete Diffusion Model for Text-to-Sound Generation","display_name":"Diffsound: Discrete Diffusion Model for Text-to-Sound Generation","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4367359628","doi":"https://doi.org/10.1109/taslp.2023.3268730"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3268730","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3268730","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043555011","display_name":"Dongchao Yang","orcid":"https://orcid.org/0000-0002-8905-224X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dongchao Yang","raw_affiliation_strings":["Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004643540","display_name":"Jianwei Yu","orcid":"https://orcid.org/0000-0002-2449-1436"},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jianwei Yu","raw_affiliation_strings":["Tencent AI Lab, Bellevue, WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue, WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101664824","display_name":"Helin Wang","orcid":"https://orcid.org/0000-0001-6088-0378"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Helin Wang","raw_affiliation_strings":["Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100396385","display_name":"Wen Wang","orcid":"https://orcid.org/0000-0003-1707-0163"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wen Wang","raw_affiliation_strings":["Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106404246","display_name":"Chao Weng","orcid":null},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chao Weng","raw_affiliation_strings":["Tencent AI Lab, Bellevue, WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue, WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002795838","display_name":"Yuexian Zou","orcid":"https://orcid.org/0000-0001-9999-6140"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuexian Zou","raw_affiliation_strings":["Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Advanced Data and Signal Processing Laboratory and School of Electronic and Computer Engineering, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034476404","display_name":"Dong Yu","orcid":"https://orcid.org/0000-0003-0520-6844"},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dong Yu","raw_affiliation_strings":["Tencent AI Lab, Bellevue, WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue, WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5043555011"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":34.5178,"has_fulltext":false,"cited_by_count":178,"citation_normalized_percentile":{"value":0.998949,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"31","issue":null,"first_page":"1720","last_page":"1733"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9919000267982483,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.9341681003570557},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.9239364862442017},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7200990319252014},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6868448853492737},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.6195789575576782},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5835521817207336},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4700514078140259},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.41583842039108276},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.4124252200126648},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3761557638645172},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.2528116703033447},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.17263859510421753},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08288848400115967},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.07714489102363586}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.9341681003570557},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.9239364862442017},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7200990319252014},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6868448853492737},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.6195789575576782},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5835521817207336},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4700514078140259},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.41583842039108276},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.4124252200126648},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3761557638645172},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.2528116703033447},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.17263859510421753},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08288848400115967},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.07714489102363586},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3268730","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3268730","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":87,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1956340063","https://openalex.org/W2108598243","https://openalex.org/W2120847449","https://openalex.org/W2129069237","https://openalex.org/W2183341477","https://openalex.org/W2296073425","https://openalex.org/W2405756170","https://openalex.org/W2506483933","https://openalex.org/W2519091744","https://openalex.org/W2593116425","https://openalex.org/W2752796333","https://openalex.org/W2896457183","https://openalex.org/W2908510526","https://openalex.org/W2916103538","https://openalex.org/W2935170919","https://openalex.org/W2952716587","https://openalex.org/W2963073614","https://openalex.org/W2963413689","https://openalex.org/W2963609956","https://openalex.org/W2963799213","https://openalex.org/W2963807156","https://openalex.org/W2970006822","https://openalex.org/W2971074500","https://openalex.org/W2972951102","https://openalex.org/W2985308740","https://openalex.org/W3015371781","https://openalex.org/W3015591594","https://openalex.org/W3036167779","https://openalex.org/W3046890131","https://openalex.org/W3092028330","https://openalex.org/W3129576130","https://openalex.org/W3129651364","https://openalex.org/W3136272958","https://openalex.org/W3162926177","https://openalex.org/W3165647589","https://openalex.org/W3166396011","https://openalex.org/W3168053944","https://openalex.org/W3172148458","https://openalex.org/W3172617364","https://openalex.org/W3174285493","https://openalex.org/W3174758275","https://openalex.org/W3180355996","https://openalex.org/W3185739472","https://openalex.org/W3196163807","https://openalex.org/W3198213150","https://openalex.org/W3201409833","https://openalex.org/W3207498282","https://openalex.org/W3214281017","https://openalex.org/W4224035735","https://openalex.org/W4226125322","https://openalex.org/W4287083626","https://openalex.org/W4287329820","https://openalex.org/W4292779060","https://openalex.org/W4312388283","https://openalex.org/W6631190155","https://openalex.org/W6713645886","https://openalex.org/W6725318829","https://openalex.org/W6736996214","https://openalex.org/W6755312952","https://openalex.org/W6757817989","https://openalex.org/W6762931180","https://openalex.org/W6763509872","https://openalex.org/W6765779288","https://openalex.org/W6767111847","https://openalex.org/W6778883912","https://openalex.org/W6779823529","https://openalex.org/W6780226713","https://openalex.org/W6783182287","https://openalex.org/W6783867762","https://openalex.org/W6788990321","https://openalex.org/W6790978476","https://openalex.org/W6791353385","https://openalex.org/W6792105156","https://openalex.org/W6795261426","https://openalex.org/W6795288823","https://openalex.org/W6796163713","https://openalex.org/W6796242362","https://openalex.org/W6796730497","https://openalex.org/W6797095309","https://openalex.org/W6798447524","https://openalex.org/W6798955355","https://openalex.org/W6799028840","https://openalex.org/W6800989748","https://openalex.org/W6802805937","https://openalex.org/W6809885388","https://openalex.org/W6810940779"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W2011227383","https://openalex.org/W4375868962","https://openalex.org/W2088854863","https://openalex.org/W1976719989","https://openalex.org/W2942893872","https://openalex.org/W2065606036","https://openalex.org/W3179495260","https://openalex.org/W3127543252","https://openalex.org/W2016904525"],"abstract_inverted_index":{"Generating":[0],"sound":[1,19,27,131,165,172],"effects":[2],"that":[3,40,97,230],"people":[4],"want":[5],"is":[6,85,269],"an":[7],"important":[8],"topic.":[9],"However,":[10,134],"there":[11],"are":[12],"limited":[13],"studies":[14],"in":[15,113,129,146,205,214],"this":[16,22,114],"area":[17],"for":[18],"generation.":[20],"In":[21],"study,":[23],"we":[24,106,182,286],"investigate":[25],"generating":[26],"conditioned":[28],"on":[29,108,188],"a":[30,35,43,46,52,55,74,93,110,184,251],"text":[31,44,66,71],"prompt":[32],"and":[33,54,81,154,208,265,303,313],"propose":[34,183],"novel":[36],"text-to-sound":[37,239],"generation":[38,103,132,166,240,253,267],"framework":[39,58],"consists":[41],"of":[42,79,156,201,283,315],"encoder,":[45],"Vector":[47],"Quantized":[48],"Variational":[49],"Autoencoder":[50],"(VQ-VAE),":[51],"token-decoder,":[53,123,163],"vocoder.":[56],"The":[57],"first":[59],"uses":[60],"the":[61,65,70,77,83,89,98,102,135,140,151,161,164,171,176,189,196,202,211,215,219,245,266,274,281,311,316],"token-decoder":[62,99,112,137,186,247],"to":[63,73,87,278],"transfer":[64],"features":[67],"extracted":[68],"from":[69],"encoder":[72],"mel-spectrogram":[75,91,141,203],"with":[76,118,160,170,244],"help":[78],"VQ-VAE,":[80],"then":[82,209],"vocoder":[84],"used":[86],"transform":[88],"generated":[90,284,317],"into":[92],"waveform.":[94],"We":[95,116],"found":[96],"significantly":[100],"influences":[101],"performance.":[104],"Thus,":[105],"focus":[107],"designing":[109],"good":[111],"study.":[115],"begin":[117],"th21e":[119],"traditional":[120],"autoregressive":[121],"(AR)":[122],"which":[124,148,307],"has":[125,250],"shown":[126],"state-of-the-art":[127],"performance":[128],"previous":[130],"works.":[133],"AR":[136,162,180,246,275],"always":[138],"predicts":[139,199],"tokens":[142,204,213],"one":[143,145,206],"by":[144,179,225],"order,":[147],"may":[149],"introduce":[150],"unidirectional":[152],"bias":[153],"accumulation":[155],"errors":[157],"problems.":[158],"Moreover,":[159],"time":[167],"increases":[168],"linearly":[169],"duration.":[173],"To":[174],"overcome":[175],"shortcomings":[177],"introduced":[178],"token-decoders,":[181],"non-autoregressive":[185],"based":[187],"discrete":[190],"diffusion":[191],"model,":[192],"named":[193],"Diffsound.":[194],"Specifically,":[195],"Diffsound":[197,233],"model":[198,234],"all":[200],"step":[207],"refines":[210],"predicted":[212],"next":[216],"step,":[217],"so":[218],"best-predicted":[220],"results":[221,241],"can":[222,308],"be":[223],"obtained":[224],"iteration.":[226],"Our":[227],"experiments":[228],"show":[229],"our":[231],"proposed":[232],"not":[235],"only":[236],"produces":[237],"better":[238],"when":[242],"compared":[243],"but":[248],"also":[249],"faster":[252,272],"speed,":[254],"<italic":[255,261,293],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[256,262,294],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">i.e.</i>":[257,295],",":[258,296],"MOS:":[259],"3.56":[260],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">v.s</i>":[263],"2.786,":[264],"speed":[268],"five":[270],"times":[271],"than":[273],"decoder.":[276],"Furthermore,":[277],"automatically":[279],"assess":[280,310],"quality":[282],"samples,":[285],"define":[287],"three":[288],"different":[289],"objective":[290],"evaluation":[291],"metrics":[292],"Fr\u00e9chet":[297],"Inception":[298],"Distance":[299],"(FID),":[300],"Kullback-Leibler":[301],"(KL),":[302],"audio":[304],"caption":[305],"loss,":[306],"comprehensively":[309],"relevance":[312],"fidelity":[314],"samples.":[318]},"counts_by_year":[{"year":2026,"cited_by_count":15},{"year":2025,"cited_by_count":63},{"year":2024,"cited_by_count":73},{"year":2023,"cited_by_count":26},{"year":2022,"cited_by_count":1}],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
