{"id":"https://openalex.org/W3197273793","doi":"https://doi.org/10.21437/interspeech.2021-1016","title":"UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation","display_name":"UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3197273793","doi":"https://doi.org/10.21437/interspeech.2021-1016","mag":"3197273793"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-1016","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1016","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103192158","display_name":"Won Jang","orcid":"https://orcid.org/0000-0002-4711-780X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Won Jang","raw_affiliation_strings":["Kakao Enterprise Corporation, Seongnam, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corporation, Seongnam, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027636448","display_name":"Dan Lim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dan Lim","raw_affiliation_strings":["Kakao Corporation, Seongnam, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kakao Corporation, Seongnam, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050154985","display_name":"Jaesam Yoon","orcid":"https://orcid.org/0000-0002-9978-0582"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jaesam Yoon","raw_affiliation_strings":["Kakao Enterprise Corporation, Seongnam, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corporation, Seongnam, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061409578","display_name":"Bong\u2010Wan Kim","orcid":"https://orcid.org/0000-0001-9059-0451"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bongwan Kim","raw_affiliation_strings":["Kakao Enterprise Corporation, Seongnam, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corporation, Seongnam, Republic of Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100710223","display_name":"Juntae Kim","orcid":"https://orcid.org/0000-0003-3344-4591"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Juntae Kim","raw_affiliation_strings":["Kakao Enterprise Corporation, Seongnam, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corporation, Seongnam, Republic of Korea","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100710223"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":11.5772,"has_fulltext":false,"cited_by_count":107,"citation_normalized_percentile":{"value":0.99109176,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"2207","last_page":"2211"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9843999743461609,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9606999754905701,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.9366955757141113},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.7097399234771729},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6774333715438843},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.5391265749931335},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5290590524673462},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.511845052242279},{"id":"https://openalex.org/keywords/resolution","display_name":"Resolution (logic)","score":0.45917534828186035},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3307352364063263},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.23490867018699646},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.09595295786857605},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.09263768792152405}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.9366955757141113},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.7097399234771729},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6774333715438843},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.5391265749931335},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5290590524673462},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.511845052242279},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.45917534828186035},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3307352364063263},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.23490867018699646},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.09595295786857605},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.09263768792152405},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2021-1016","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1016","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7300000190734863,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1552314771","https://openalex.org/W2049686551","https://openalex.org/W2120847449","https://openalex.org/W2197404611","https://openalex.org/W2284050935","https://openalex.org/W2423557781","https://openalex.org/W2471520273","https://openalex.org/W2519091744","https://openalex.org/W2593414223","https://openalex.org/W2889329491","https://openalex.org/W2963300588","https://openalex.org/W2963975282","https://openalex.org/W2964243274","https://openalex.org/W2970006822","https://openalex.org/W2972359262","https://openalex.org/W2993118648","https://openalex.org/W3015338123","https://openalex.org/W3033411150","https://openalex.org/W3092028330","https://openalex.org/W3096442195","https://openalex.org/W3130774171","https://openalex.org/W3144035034","https://openalex.org/W3150572638","https://openalex.org/W3161236344","https://openalex.org/W4252713891","https://openalex.org/W4298580827","https://openalex.org/W4320013936"],"related_works":["https://openalex.org/W4236436342","https://openalex.org/W3207498282","https://openalex.org/W4252591235","https://openalex.org/W3082910224","https://openalex.org/W2951616896","https://openalex.org/W4287672314","https://openalex.org/W2468514837","https://openalex.org/W4289542528","https://openalex.org/W2889822993","https://openalex.org/W4213393969"],"abstract_inverted_index":{"Most":[0],"neural":[1,54],"vocoders":[2],"employ":[3],"band-limited":[4],"mel-spectrograms":[5,90],"to":[6,95,154],"generate":[7,96],"waveforms.If":[8],"full-band":[9,33,89],"spectral":[10],"features":[11],"are":[12,45],"used":[13],"as":[14,23,27,39,91,109],"the":[15,17,66,110,125,142,149],"input,":[16,92],"vocoder":[18,55],"can":[19],"be":[20],"provided":[21],"with":[22],"much":[24],"acoustic":[25],"information":[26,118],"possible.However,":[28],"in":[29,60,65],"some":[30],"models":[31,133],"employing":[32],"mel-spectrograms,":[34],"an":[35,112],"over-smoothing":[36],"problem":[37],"occurs":[38],"part":[40],"of":[41,68,106,121],"which":[42],"non-sharp":[43],"spectrograms":[44,105],"generated.To":[46],"address":[47],"this":[48],"problem,":[49],"we":[50,72,93],"propose":[51],"UnivNet,":[52],"a":[53,74,101,115,158],"that":[56,78,103],"synthesizes":[57],"high-fidelity":[58],"waveforms":[59],"real":[61],"time.Inspired":[62],"by":[63,99],"works":[64],"field":[67],"voice":[69],"activity":[70],"detection,":[71],"added":[73],"multiresolution":[75],"spectrogram":[76,82],"discriminator":[77,102],"employs":[79,104],"multiple":[80,107],"linear":[81],"magnitudes":[83],"computed":[84],"using":[85],"various":[86],"parameter":[87],"sets.Using":[88],"expect":[94],"high-resolution":[97],"signals":[98],"adding":[100],"resolutions":[108],"input.In":[111],"evaluation":[113],"on":[114,119],"dataset":[116],"containing":[117],"hundreds":[120],"speakers,":[122],"UnivNet":[123],"obtained":[124],"best":[126,143],"objective":[127],"and":[128,137],"subjective":[129,144],"results":[130],"among":[131],"competing":[132],"for":[134,146,151,160],"both":[135],"seen":[136],"unseen":[138],"speakers.These":[139],"results,":[140],"including":[141],"score":[145],"text-to-speech,":[147],"demonstrate":[148],"potential":[150],"fast":[152],"adaptation":[153],"new":[155],"speakers":[156],"without":[157],"need":[159],"training":[161],"from":[162],"scratch.":[163]},"counts_by_year":[{"year":2026,"cited_by_count":11},{"year":2025,"cited_by_count":21},{"year":2024,"cited_by_count":32},{"year":2023,"cited_by_count":27},{"year":2022,"cited_by_count":14},{"year":2021,"cited_by_count":2}],"updated_date":"2026-05-07T13:39:58.223016","created_date":"2025-10-10T00:00:00"}
