{"id":"https://openalex.org/W3194822862","doi":"https://doi.org/10.21437/ssw.2021-14","title":"Enhancing audio quality for expressive Neural Text-to-Speech","display_name":"Enhancing audio quality for expressive Neural Text-to-Speech","publication_year":2021,"publication_date":"2021-08-24","ids":{"openalex":"https://openalex.org/W3194822862","doi":"https://doi.org/10.21437/ssw.2021-14","mag":"3194822862"},"language":"en","primary_location":{"id":"doi:10.21437/ssw.2021-14","is_oa":false,"landing_page_url":"https://doi.org/10.21437/ssw.2021-14","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"11th ISCA Speech Synthesis Workshop (SSW 11)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027412678","display_name":"Abdelhamid Ezzerg","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Abdelhamid Ezzerg","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031116817","display_name":"Adam Gabry\u015b","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Adam Gabrys","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046126186","display_name":"Bartosz Putrycz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bartosz Putrycz","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039620116","display_name":"Daniel Korzekwa","orcid":"https://orcid.org/0000-0003-1470-4968"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Korzekwa","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087232606","display_name":"Daniel S\u00e1ez-Trigueros","orcid":"https://orcid.org/0000-0001-9623-7472"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Saez-Trigueros","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001865400","display_name":"David McHardy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"David McHardy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000904701","display_name":"Kamil Pokora","orcid":"https://orcid.org/0009-0006-0756-4118"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kamil Pokora","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063140150","display_name":"Jakub Lachowicz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jakub Lachowicz","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019530089","display_name":"Jaime Lorenzo-Trueba","orcid":"https://orcid.org/0000-0003-0459-1429"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jaime Lorenzo-Trueba","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5026578737","display_name":"Viacheslav Klimkov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Viacheslav Klimkov","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5027412678"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.6798,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.75670099,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9599999785423279,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9570000171661377,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7892760038375854},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6618614196777344},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5940433144569397},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.44782838225364685},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4135516285896301},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3283435106277466},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.3221118748188019}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7892760038375854},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6618614196777344},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5940433144569397},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44782838225364685},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4135516285896301},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3283435106277466},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3221118748188019},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/ssw.2021-14","is_oa":false,"landing_page_url":"https://doi.org/10.21437/ssw.2021-14","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"11th ISCA Speech Synthesis Workshop (SSW 11)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1572730534","https://openalex.org/W1959608418","https://openalex.org/W2024093103","https://openalex.org/W2133564696","https://openalex.org/W2406990556","https://openalex.org/W2519091744","https://openalex.org/W2604184139","https://openalex.org/W2605141709","https://openalex.org/W2666408839","https://openalex.org/W2746654391","https://openalex.org/W2785678896","https://openalex.org/W2794490148","https://openalex.org/W2804078698","https://openalex.org/W2886769154","https://openalex.org/W2941649920","https://openalex.org/W2951523806","https://openalex.org/W2962717182","https://openalex.org/W2962770929","https://openalex.org/W2963341071","https://openalex.org/W2963542120","https://openalex.org/W2963600562","https://openalex.org/W2963987720","https://openalex.org/W2964167449","https://openalex.org/W2964168257","https://openalex.org/W2964243274","https://openalex.org/W2972702018","https://openalex.org/W3015922793","https://openalex.org/W3111551290","https://openalex.org/W4289383906","https://openalex.org/W4294619240","https://openalex.org/W4297663084","https://openalex.org/W4297825690","https://openalex.org/W4298289240"],"related_works":["https://openalex.org/W2391251536","https://openalex.org/W2362198218","https://openalex.org/W1984922432","https://openalex.org/W1982750869","https://openalex.org/W2375008505","https://openalex.org/W2086348228","https://openalex.org/W2019521278","https://openalex.org/W2350679292","https://openalex.org/W2113077220","https://openalex.org/W2348218075"],"abstract_inverted_index":{"Artificial":[0],"speech":[1,21],"synthesis":[2],"has":[3],"made":[4],"a":[5,52,57,68,82],"great":[6],"leap":[7],"in":[8,56,105,114,133,144],"terms":[9,145],"of":[10,19,70,81,88,111,146],"naturalness":[11,135],"as":[12],"recent":[13,44],"Text-to-Speech":[14],"(TTS)":[15],"systems":[16],"are":[17,32,39],"capable":[18],"producing":[20],"with":[22],"similar":[23],"quality":[24,80],"to":[25,34,43,50,76],"human":[26],"recordings.However,":[27],"not":[28],"all":[29],"speaking":[30],"styles":[31],"easy":[33],"model:":[35],"highly":[36],"expressive":[37,151],"voices":[38],"still":[40],"challenging":[41],"even":[42],"TTS":[45],"architectures":[46],"since":[47],"there":[48],"seems":[49],"be":[51,74],"trade-off":[53],"between":[54,136],"expressiveness":[55],"generated":[58],"audio":[59],"and":[60,108,119,140],"its":[61],"signal":[62,79],"quality.In":[63],"this":[64],"paper,":[65],"we":[66],"present":[67],"set":[69],"techniques":[71,92,128],"that":[72],"can":[73],"leveraged":[75],"enhance":[77],"the":[78,86,95,109,116,120,131,137],"highly-expressive":[83],"voice":[84],"without":[85],"use":[87,110],"additional":[89],"data.The":[90],"proposed":[91],"include:":[93],"tuning":[94],"autoregressive":[96],"loop's":[97],"granularity":[98],"during":[99],"training;":[100],"using":[101],"Generative":[102],"Adversarial":[103],"Networks":[104],"acoustic":[106,117],"modeling;":[107],"Variational":[112],"Auto-Encoders":[113],"both":[115],"model":[118],"neural":[121],"vocoder.We":[122],"show":[123],"that,":[124],"when":[125],"combined,":[126],"these":[127],"greatly":[129],"closed":[130],"gap":[132],"perceived":[134],"baseline":[138],"system":[139],"recordings":[141],"by":[142],"39%":[143],"MUSHRA":[147],"scores":[148],"for":[149],"an":[150],"celebrity":[152],"voice.":[153]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
