{"id":"https://openalex.org/W4391615163","doi":"https://doi.org/10.1109/icassp48485.2024.10446830","title":"SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis","display_name":"SpecDiff-GAN: A Spectrally-Shaped Noise Diffusion GAN for Speech and Music Synthesis","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4391615163","doi":"https://doi.org/10.1109/icassp48485.2024.10446830"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446830","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446830","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2402.01753","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093882801","display_name":"Teysir Baoueb","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165912","display_name":"Laboratoire Traitement et Communication de l\u2019Information","ror":"https://ror.org/057er4c39","country_code":"FR","type":"facility","lineage":["https://openalex.org/I12356871","https://openalex.org/I4210145102","https://openalex.org/I4210145102","https://openalex.org/I4210165912"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Teysir Baoueb","raw_affiliation_strings":["IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France"],"affiliations":[{"raw_affiliation_string":"IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France","institution_ids":["https://openalex.org/I4210165912"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070500491","display_name":"Haochen Liu","orcid":"https://orcid.org/0000-0002-2991-3642"},"institutions":[{"id":"https://openalex.org/I4210165912","display_name":"Laboratoire Traitement et Communication de l\u2019Information","ror":"https://ror.org/057er4c39","country_code":"FR","type":"facility","lineage":["https://openalex.org/I12356871","https://openalex.org/I4210145102","https://openalex.org/I4210145102","https://openalex.org/I4210165912"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Haocheng Liu","raw_affiliation_strings":["IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France"],"affiliations":[{"raw_affiliation_string":"IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France","institution_ids":["https://openalex.org/I4210165912"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082784886","display_name":"Mathieu Fontaine","orcid":"https://orcid.org/0000-0002-7657-6271"},"institutions":[{"id":"https://openalex.org/I4210165912","display_name":"Laboratoire Traitement et Communication de l\u2019Information","ror":"https://ror.org/057er4c39","country_code":"FR","type":"facility","lineage":["https://openalex.org/I12356871","https://openalex.org/I4210145102","https://openalex.org/I4210145102","https://openalex.org/I4210165912"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Mathieu Fontaine","raw_affiliation_strings":["IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France"],"affiliations":[{"raw_affiliation_string":"IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France","institution_ids":["https://openalex.org/I4210165912"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076453358","display_name":"Jonathan Le Roux","orcid":"https://orcid.org/0000-0002-3451-171X"},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonathan Le Roux","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL),Cambridge,MA,USA","Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL),Cambridge,MA,USA","institution_ids":["https://openalex.org/I4210159266"]},{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055423112","display_name":"Ga\u00ebl Richard","orcid":"https://orcid.org/0000-0002-4960-0010"},"institutions":[{"id":"https://openalex.org/I4210165912","display_name":"Laboratoire Traitement et Communication de l\u2019Information","ror":"https://ror.org/057er4c39","country_code":"FR","type":"facility","lineage":["https://openalex.org/I12356871","https://openalex.org/I4210145102","https://openalex.org/I4210145102","https://openalex.org/I4210165912"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Ga\u00ebl Richard","raw_affiliation_strings":["IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France"],"affiliations":[{"raw_affiliation_string":"IP-Paris,LTCI, T&#x00E9;l&#x00E9;com Paris,France","institution_ids":["https://openalex.org/I4210165912"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5093882801"],"corresponding_institution_ids":["https://openalex.org/I4210165912"],"apc_list":null,"apc_paid":null,"fwci":2.3821,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":{"value":0.8858687,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"986","last_page":"990"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5830994844436646},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.5348293781280518},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5222419500350952},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.45928052067756653},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.44649070501327515},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4093775749206543},{"id":"https://openalex.org/keywords/optoelectronics","display_name":"Optoelectronics","score":0.370775043964386},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.29413795471191406},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.08480286598205566}],"concepts":[{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5830994844436646},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.5348293781280518},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5222419500350952},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.45928052067756653},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.44649070501327515},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4093775749206543},{"id":"https://openalex.org/C49040817","wikidata":"https://www.wikidata.org/wiki/Q193091","display_name":"Optoelectronics","level":1,"score":0.370775043964386},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.29413795471191406},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.08480286598205566},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446830","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446830","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2402.01753","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.01753","pdf_url":"https://arxiv.org/pdf/2402.01753","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"pmh:oai:HAL:hal-04423979v1","is_oa":true,"landing_page_url":"https://hal.science/hal-04423979","pdf_url":"https://hal.science/hal-04423979/document","source":{"id":"https://openalex.org/S4406922461","display_name":"SPIRE - Sciences Po Institutional REpository","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE International Conference on Acoustics, Speech and Signal Processing, Apr 2024, Seoul (Korea), South Korea. &#x27E8;10.1109/ICASSP48485.2024.10446830&#x27E9;","raw_type":"Conference papers"},{"id":"doi:10.48550/arxiv.2402.01753","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2402.01753","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2402.01753","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.01753","pdf_url":"https://arxiv.org/pdf/2402.01753","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4326088756","display_name":null,"funder_award_id":"101052978","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4391615163.pdf"},"referenced_works_count":33,"referenced_works":["https://openalex.org/W2067295501","https://openalex.org/W2519091744","https://openalex.org/W2526050071","https://openalex.org/W2910577860","https://openalex.org/W2963300588","https://openalex.org/W2964243274","https://openalex.org/W2972478942","https://openalex.org/W2972495969","https://openalex.org/W2998572311","https://openalex.org/W3098557217","https://openalex.org/W3099378280","https://openalex.org/W3160077247","https://openalex.org/W3197273793","https://openalex.org/W4225566824","https://openalex.org/W4243637566","https://openalex.org/W4281736089","https://openalex.org/W4287120591","https://openalex.org/W4299412574","https://openalex.org/W4372346639","https://openalex.org/W4400381851","https://openalex.org/W6607890289","https://openalex.org/W6633117090","https://openalex.org/W6717951019","https://openalex.org/W6746775625","https://openalex.org/W6758675244","https://openalex.org/W6772349387","https://openalex.org/W6779093361","https://openalex.org/W6779577414","https://openalex.org/W6779823529","https://openalex.org/W6782760101","https://openalex.org/W6783867762","https://openalex.org/W6785529287","https://openalex.org/W6838910450"],"related_works":["https://openalex.org/W2386796262","https://openalex.org/W3177740930","https://openalex.org/W2031453039","https://openalex.org/W1484549928","https://openalex.org/W1974805659","https://openalex.org/W2811388477","https://openalex.org/W2549138563","https://openalex.org/W2335010068","https://openalex.org/W2899315007","https://openalex.org/W2011653655"],"abstract_inverted_index":{"Generative":[0],"adversarial":[1],"network":[2],"(GAN)":[3],"models":[4],"can":[5],"synthesize":[6],"high-quality":[7],"audio":[8,137],"signals":[9],"while":[10],"ensuring":[11],"fast":[12],"sample":[13],"generation.":[14],"However,":[15],"they":[16],"are":[17,22],"difficult":[18],"to":[19,24,78,87,104,142],"train":[20],"and":[21,30,81,122,139],"prone":[23],"several":[25,126,143],"issues":[26],"including":[27],"mode":[28],"collapse":[29],"divergence.":[31],"In":[32,54],"this":[33],"paper,":[34],"we":[35],"introduce":[36],"SpecDiff-GAN,":[37],"a":[38,65,75,97],"neural":[39],"vocoder":[40],"based":[41],"on":[42,125],"HiFi-GAN,":[43],"which":[44,69],"was":[45],"initially":[46],"devised":[47],"for":[48,120],"speech":[49,121],"synthesis":[50,124],"from":[51,74],"mel":[52],"spectrogram.":[53],"our":[55,117,132],"model,":[56],"the":[57,88,93,102,106,114],"training":[58],"stability":[59],"is":[60],"enhanced":[61],"by":[62,95],"means":[63],"of":[64,116],"forward":[66],"diffusion":[67],"process":[68],"consists":[70],"in":[71,136],"injecting":[72],"noise":[73,99],"Gaussian":[76],"distribution":[77,100],"both":[79],"real":[80],"fake":[82],"samples":[83],"before":[84],"inputting":[85],"them":[86],"discriminator.":[89],"We":[90,111],"further":[91],"improve":[92],"model":[94,119,133],"exploiting":[96],"spectrally-shaped":[98],"with":[101],"aim":[103],"make":[105],"discriminator's":[107],"task":[108],"more":[109],"challenging.":[110],"then":[112],"show":[113],"merits":[115],"proposed":[118],"music":[123],"datasets.":[127],"Our":[128],"experiments":[129],"confirm":[130],"that":[131],"compares":[134],"favorably":[135],"quality":[138],"efficiency":[140],"compared":[141],"baselines.":[144]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2024-02-08T00:00:00"}
