{"id":"https://openalex.org/W2887511658","doi":"https://doi.org/10.1109/access.2018.2872060","title":"Wasserstein GAN and Waveform Loss-Based Acoustic Model Training for Multi-Speaker Text-to-Speech Synthesis Systems Using a WaveNet Vocoder","display_name":"Wasserstein GAN and Waveform Loss-Based Acoustic Model Training for Multi-Speaker Text-to-Speech Synthesis Systems Using a WaveNet Vocoder","publication_year":2018,"publication_date":"2018-01-01","ids":{"openalex":"https://openalex.org/W2887511658","doi":"https://doi.org/10.1109/access.2018.2872060","mag":"2887511658"},"language":"en","primary_location":{"id":"doi:10.1109/access.2018.2872060","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2018.2872060","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2018.2872060","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103063144","display_name":"Yi Zhao","orcid":"https://orcid.org/0000-0002-3555-9408"},"institutions":[{"id":"https://openalex.org/I14396692","display_name":"Tokyo University of Information Sciences","ror":"https://ror.org/044bdx604","country_code":"JP","type":"education","lineage":["https://openalex.org/I14396692"]},{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yi Zhao","raw_affiliation_strings":["Department of Electrical Engineering and Information Systems, The University of Tokyo, Tokyo, Japan","Department of Electrical Engineering and Information Systems, Graduate School of Engineering, The University of Tokyo, Tokyo, Japan","ORCiD"],"raw_orcid":"https://orcid.org/0000-0002-3555-9408","affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering and Information Systems, The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"]},{"raw_affiliation_string":"Department of Electrical Engineering and Information Systems, Graduate School of Engineering, The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"]},{"raw_affiliation_string":"ORCiD","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062895056","display_name":"Shinji Takaki","orcid":"https://orcid.org/0000-0001-7294-7699"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinji Takaki","raw_affiliation_strings":["Digital Content and Media Sciences Research Division, National Institute of Informatics, Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Digital Content and Media Sciences Research Division, National Institute of Informatics, Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002145453","display_name":"Hieu-Thi Luong","orcid":"https://orcid.org/0000-0002-4772-5995"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hieu-Thi Luong","raw_affiliation_strings":["Digital Content and Media Sciences Research Division, National Institute of Informatics, Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Digital Content and Media Sciences Research Division, National Institute of Informatics, Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007639385","display_name":"Junichi Yamagishi","orcid":"https://orcid.org/0000-0003-2752-3955"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Junichi Yamagishi","raw_affiliation_strings":["Digital Content and Media Sciences Research Division, National Institute of Informatics, Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Digital Content and Media Sciences Research Division, National Institute of Informatics, Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010841595","display_name":"Daisuke Saito","orcid":"https://orcid.org/0000-0003-3200-579X"},"institutions":[{"id":"https://openalex.org/I14396692","display_name":"Tokyo University of Information Sciences","ror":"https://ror.org/044bdx604","country_code":"JP","type":"education","lineage":["https://openalex.org/I14396692"]},{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Daisuke Saito","raw_affiliation_strings":["Department of Electrical Engineering and Information Systems, The University of Tokyo, Tokyo, Japan","Department of Electrical Engineering and Information Systems, Graduate School of Engineering, The University of Tokyo, Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering and Information Systems, The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"]},{"raw_affiliation_string":"Department of Electrical Engineering and Information Systems, Graduate School of Engineering, The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5041213266","display_name":"Nobuaki Minematsu","orcid":"https://orcid.org/0000-0002-8778-9555"},"institutions":[{"id":"https://openalex.org/I14396692","display_name":"Tokyo University of Information Sciences","ror":"https://ror.org/044bdx604","country_code":"JP","type":"education","lineage":["https://openalex.org/I14396692"]},{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Nobuaki Minematsu","raw_affiliation_strings":["Department of Electrical Engineering and Information Systems, The University of Tokyo, Tokyo, Japan","Department of Electrical Engineering and Information Systems, Graduate School of Engineering, The University of Tokyo, Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering and Information Systems, The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"]},{"raw_affiliation_string":"Department of Electrical Engineering and Information Systems, Graduate School of Engineering, The University of Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5103063144"],"corresponding_institution_ids":["https://openalex.org/I14396692","https://openalex.org/I74801974"],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":8.2996,"has_fulltext":false,"cited_by_count":68,"citation_normalized_percentile":{"value":0.97998502,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"6","issue":null,"first_page":"60478","last_page":"60488"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7775459289550781},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.704052209854126},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.6639015674591064},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6461116075515747},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.5551321506500244},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5350092053413391},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.4356009364128113},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4322899281978607},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3057050406932831},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.2806679904460907},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.07197389006614685}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7775459289550781},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.704052209854126},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.6639015674591064},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6461116075515747},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.5551321506500244},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5350092053413391},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.4356009364128113},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4322899281978607},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3057050406932831},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2806679904460907},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.07197389006614685},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/access.2018.2872060","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2018.2872060","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:940663e0a8f74dbfbb1aa673ff64b3a5","is_oa":true,"landing_page_url":"https://doaj.org/article/940663e0a8f74dbfbb1aa673ff64b3a5","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Access, Vol 6, Pp 60478-60488 (2018)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1109/access.2018.2872060","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2018.2872060","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.75}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W95152782","https://openalex.org/W1492383498","https://openalex.org/W1522301498","https://openalex.org/W1563460361","https://openalex.org/W1576227399","https://openalex.org/W1963637322","https://openalex.org/W1990505856","https://openalex.org/W2099471712","https://openalex.org/W2102003408","https://openalex.org/W2125389028","https://openalex.org/W2294797155","https://openalex.org/W2423557781","https://openalex.org/W2494654097","https://openalex.org/W2516321201","https://openalex.org/W2516559027","https://openalex.org/W2519091744","https://openalex.org/W2559246505","https://openalex.org/W2591927543","https://openalex.org/W2605320104","https://openalex.org/W2658996865","https://openalex.org/W2749609415","https://openalex.org/W2749651610","https://openalex.org/W2751185861","https://openalex.org/W2759925408","https://openalex.org/W2786868129","https://openalex.org/W2794490148","https://openalex.org/W2795739483","https://openalex.org/W2799084856","https://openalex.org/W2949382160","https://openalex.org/W2962879692","https://openalex.org/W2963411216","https://openalex.org/W2963522141","https://openalex.org/W2963534259","https://openalex.org/W2963636093","https://openalex.org/W2963691546","https://openalex.org/W2963971656","https://openalex.org/W2964121744","https://openalex.org/W2964122153","https://openalex.org/W2964243274","https://openalex.org/W3142087749","https://openalex.org/W4294619240","https://openalex.org/W4295521014","https://openalex.org/W4298642009","https://openalex.org/W4298857617","https://openalex.org/W4320013936","https://openalex.org/W4385245566","https://openalex.org/W6603838645","https://openalex.org/W6696843773","https://openalex.org/W6735913928","https://openalex.org/W6736204136","https://openalex.org/W6739901393","https://openalex.org/W6745569068","https://openalex.org/W6749489859"],"related_works":["https://openalex.org/W1914543332","https://openalex.org/W2108985546","https://openalex.org/W2038801705","https://openalex.org/W2077992636","https://openalex.org/W2279374969","https://openalex.org/W1537411440","https://openalex.org/W290673751","https://openalex.org/W2017702615","https://openalex.org/W2147186888","https://openalex.org/W2024201202"],"abstract_inverted_index":{"WaveNet,":[0],"which":[1,72],"learns":[2],"directly":[3],"from":[4],"speech":[5,21,81,144],"waveform":[6],"samples,":[7],"has":[8],"been":[9],"used":[10,163],"as":[11,43,155,164,197],"an":[12,156],"alternative":[13],"to":[14,51,67,104,109,190],"vocoders":[15],"and":[16,27,47,116,159,178,194,228],"achieved":[17],"very":[18],"high-quality":[19],"synthetic":[20,80],"in":[22,31,188,223],"terms":[23,224],"of":[24,79,88,169,184,199,225],"both":[25,226],"naturalness":[26],"speaker":[28,229],"similarity":[29],"even":[30],"multi-speaker":[32,143],"text-to-speech":[33],"synthesis":[34,145],"systems.":[35],"However,":[36],"the":[37,76,86,89,111,148,165,170,175,180,210,218],"WaveNet":[38,149,187],"vocoder":[39],"uses":[40,147],"acoustic":[41,57,70,90,98,118,157,206],"features":[42,99],"local":[44,166],"condition":[45,167],"parameters,":[46],"these":[48],"parameters":[49,168],"need":[50],"be":[52],"accurately":[53],"predicted":[54,97],"by":[55,85],"another":[56],"model.":[58,91],"So":[59],"far,":[60],"it":[61],"is":[62,73,82],"not":[63],"yet":[64],"clear":[65],"how":[66],"train":[68],"this":[69],"model,":[71],"problematic":[74],"because":[75],"final":[77],"quality":[78,227],"significantly":[83],"affected":[84],"performance":[87],"Significant":[92],"degradation":[93],"occurs,":[94],"especially":[95],"when":[96],"have":[100],"mismatched":[101,112],"characteristics":[102,113],"compared":[103],"natural":[105,115],"ones.":[106],"In":[107],"order":[108],"reduce":[110],"between":[114],"generated":[117],"features,":[119],"we":[120],"propose":[121],"new":[122],"frameworks":[123,177],"that":[124,146,205],"incorporate":[125],"either":[126],"a":[127,185],"conditional":[128],"generative":[129],"adversarial":[130,195],"network":[131],"(GAN)":[132],"or":[133],"its":[134,160],"variant,":[135],"Wasserstein":[136],"GAN":[137,152,176],"with":[138],"gradient":[139],"penalty":[140],"(WGAN-GP),":[141],"into":[142],"vocoder.":[150],"The":[151],"generator":[153],"performs":[154],"model":[158],"outputs":[161],"are":[162],"WaveNet.":[171],"We":[172],"also":[173],"extend":[174],"use":[179],"discretized-mixture-of-logistics":[181],"(DML)":[182],"loss":[183,216],"well-trained":[186],"addition":[189],"mean":[191],"squared":[192],"error":[193],"losses":[196],"parts":[198],"objective":[200],"functions.":[201],"Experimental":[202],"results":[203],"show":[204],"models":[207],"trained":[208],"using":[209,213],"WGAN-GP":[211],"framework":[212],"back-propagated":[214],"DML":[215],"achieves":[217],"highest":[219],"subjective":[220],"evaluation":[221],"scores":[222],"similarity.":[230]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":13},{"year":2020,"cited_by_count":22},{"year":2019,"cited_by_count":13},{"year":2018,"cited_by_count":1}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
