{"id":"https://openalex.org/W4224861944","doi":"https://doi.org/10.21437/interspeech.2022-737","title":"Hierarchical and Multi-Scale Variational Autoencoder for Diverse and Natural Non-Autoregressive Text-to-Speech","display_name":"Hierarchical and Multi-Scale Variational Autoencoder for Diverse and Natural Non-Autoregressive Text-to-Speech","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4224861944","doi":"https://doi.org/10.21437/interspeech.2022-737"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-737","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-737","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060429696","display_name":"Jae\u2010sung Bae","orcid":"https://orcid.org/0000-0002-8270-0072"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jaesung Bae","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073217180","display_name":"Jinhyeok Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinhyeok Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073688244","display_name":"Taejun Bak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taejun Bak","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101800027","display_name":"Young-Sun Joo","orcid":"https://orcid.org/0000-0002-7428-5868"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Young-Sun Joo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5060429696"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3122,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.47524646,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9175999760627747,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.947013258934021},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.786596417427063},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.7284695506095886},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.6927698850631714},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6833550930023193},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6268447637557983},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5042842626571655},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4844663143157959},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4579104781150818},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.450887531042099},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4174368381500244},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4121996760368347},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.28181755542755127},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.14252519607543945},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.05842289328575134}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.947013258934021},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.786596417427063},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.7284695506095886},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.6927698850631714},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6833550930023193},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6268447637557983},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5042842626571655},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4844663143157959},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4579104781150818},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.450887531042099},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4174368381500244},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4121996760368347},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.28181755542755127},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.14252519607543945},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.05842289328575134},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2022-737","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-737","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6200000047683716}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W169399214","https://openalex.org/W3100825170","https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W1927421023","https://openalex.org/W10581632","https://openalex.org/W2108985546","https://openalex.org/W2433276473","https://openalex.org/W3149582125","https://openalex.org/W2077992636"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"a":[3],"hierarchical":[4],"and":[5,34,45,67,79,110,122],"multi-scale":[6],"variational":[7,120],"autoencoder-based":[8],"non-autoregressive":[9,24],"text-to-speech":[10],"model":[11,60,105],"(HiMuV-TTS)":[12],"to":[13,49,115],"generate":[14,107],"natural":[15,111],"speech":[16,91,112],"with":[17,118],"diverse":[18,109],"speaking":[19,43],"styles.":[20],"Recent":[21],"advances":[22],"in":[23,128],"TTS":[25,116],"(NAR-TTS)":[26],"models":[27,117],"have":[28],"significantly":[29],"improved":[30],"the":[31,40,58,64,70,76,80,88,94,102],"inference":[32],"speed":[33],"robustness":[35],"of":[36,42,90],"synthesized":[37],"speech.":[38],"However,":[39],"diversity":[41],"styles":[44],"naturalness":[46],"are":[47],"needed":[48],"be":[50],"improved.":[51],"To":[52],"solve":[53],"this":[54],"problem,":[55],"we":[56,86],"propose":[57],"HiMuV-TTS":[59,104],"that":[61,101],"first":[62],"determines":[63,69],"global-scale":[65,77],"prosody":[66,72,78,126],"then":[68],"local-scale":[71],"via":[73],"conditioning":[74],"on":[75],"learned":[81],"text":[82],"representation.":[83],"In":[84],"addition,":[85],"improve":[87],"quality":[89],"by":[92],"adopting":[93],"adversarial":[95],"training":[96],"technique.":[97],"Experimental":[98],"results":[99],"verify":[100],"proposed":[103],"can":[106,123],"more":[108],"as":[113],"compared":[114],"single-scale":[119],"autoencoders,":[121],"represent":[124],"different":[125],"information":[127],"each":[129],"scale.":[130]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
