{"id":"https://openalex.org/W4402301063","doi":"https://doi.org/10.1109/taslp.2024.3451951","title":"ZMM-TTS: Zero-Shot Multilingual and Multispeaker Speech Synthesis Conditioned on Self-Supervised Discrete Speech Representations","display_name":"ZMM-TTS: Zero-Shot Multilingual and Multispeaker Speech Synthesis Conditioned on Self-Supervised Discrete Speech Representations","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4402301063","doi":"https://doi.org/10.1109/taslp.2024.3451951"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3451951","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3451951","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://www.research.ed.ac.uk/files/473918569/GongEtalITASLP2024ZMMTTSZeroShotMultilingual.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5107748760","display_name":"Gong Cheng","orcid":"https://orcid.org/0009-0004-0272-3541"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Cheng Gong","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0009-0004-0272-3541","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100327839","display_name":"Xin Wang","orcid":"https://orcid.org/0000-0001-8246-0606"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Xin Wang","raw_affiliation_strings":["National Institute of Informatics (NII), Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0001-8246-0606","affiliations":[{"raw_affiliation_string":"National Institute of Informatics (NII), Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082349516","display_name":"Erica Cooper","orcid":"https://orcid.org/0000-0002-2978-2793"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Erica Cooper","raw_affiliation_strings":["National Institute of Informatics (NII), Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0002-2978-2793","affiliations":[{"raw_affiliation_string":"National Institute of Informatics (NII), Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107746751","display_name":"Dan Wells","orcid":"https://orcid.org/0000-0002-4942-4248"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Dan Wells","raw_affiliation_strings":["Centre for Speech Technology Research, University of Edinburgh, Edinburgh, U.K"],"raw_orcid":"https://orcid.org/0000-0002-4942-4248","affiliations":[{"raw_affiliation_string":"Centre for Speech Technology Research, University of Edinburgh, Edinburgh, U.K","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101745213","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-8094-6861"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-8094-6861","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-9237-4821","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055415493","display_name":"Korin Richmond","orcid":"https://orcid.org/0000-0003-1450-8270"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Korin Richmond","raw_affiliation_strings":["Centre for Speech Technology Research, University of Edinburgh, Edinburgh, U.K"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Speech Technology Research, University of Edinburgh, Edinburgh, U.K","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5007639385","display_name":"Junichi Yamagishi","orcid":"https://orcid.org/0000-0003-2752-3955"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Junichi Yamagishi","raw_affiliation_strings":["National Institute of Informatics (NII), Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0003-2752-3955","affiliations":[{"raw_affiliation_string":"National Institute of Informatics (NII), Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5107748760"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":8.7536,"has_fulltext":false,"cited_by_count":28,"citation_normalized_percentile":{"value":0.9814377,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"32","issue":null,"first_page":"4036","last_page":"4051"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.7876222133636475},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6111071109771729},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5749954581260681},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5003161430358887},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.41814109683036804},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3647077679634094},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.2891489863395691},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.05617213249206543}],"concepts":[{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.7876222133636475},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6111071109771729},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5749954581260681},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5003161430358887},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41814109683036804},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3647077679634094},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2891489863395691},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.05617213249206543}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/taslp.2024.3451951","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3451951","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:pure.ed.ac.uk:openaire/2a5d477e-83c1-44d2-83ec-c1f5fae53344","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/files/473918569/GongEtalITASLP2024ZMMTTSZeroShotMultilingual.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400321","display_name":"Edinburgh Research Explorer (University of Edinburgh)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98677209","host_organization_name":"University of Edinburgh","host_organization_lineage":["https://openalex.org/I98677209"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Gong, C, Wang, X, Cooper, E, Wells, D, Wang, L, Dang, J, Richmond, K & Yamagishi, J 2024, 'ZMM-TTS : Zero-shot multilingual and multispeaker speech synthesis conditioned on self-supervised discrete speech representations', IEEE/ACM Transactions on Audio, Speech and Language Processing, vol. 32, pp. 4036-4051. https://doi.org/10.1109/TASLP.2024.3451951","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:pure.ed.ac.uk:publications/2a5d477e-83c1-44d2-83ec-c1f5fae53344","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/en/publications/2a5d477e-83c1-44d2-83ec-c1f5fae53344","pdf_url":null,"source":{"id":"https://openalex.org/S4306400321","display_name":"Edinburgh Research Explorer (University of Edinburgh)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98677209","host_organization_name":"University of Edinburgh","host_organization_lineage":["https://openalex.org/I98677209"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Gong, C, Wang, X, Cooper, E, Wells, D, Wang, L, Dang, J, Richmond, K & Yamagishi, J 2024, 'ZMM-TTS : Zero-shot multilingual and multispeaker speech synthesis conditioned on self-supervised discrete speech representations', IEEE/ACM Transactions on Audio, Speech and Language Processing, vol. 32, pp. 4036-4051. https://doi.org/10.1109/TASLP.2024.3451951","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"pmh:oai:pure.ed.ac.uk:openaire/2a5d477e-83c1-44d2-83ec-c1f5fae53344","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/files/473918569/GongEtalITASLP2024ZMMTTSZeroShotMultilingual.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400321","display_name":"Edinburgh Research Explorer (University of Edinburgh)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98677209","host_organization_name":"University of Edinburgh","host_organization_lineage":["https://openalex.org/I98677209"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Gong, C, Wang, X, Cooper, E, Wells, D, Wang, L, Dang, J, Richmond, K & Yamagishi, J 2024, 'ZMM-TTS : Zero-shot multilingual and multispeaker speech synthesis conditioned on self-supervised discrete speech representations', IEEE/ACM Transactions on Audio, Speech and Language Processing, vol. 32, pp. 4036-4051. https://doi.org/10.1109/TASLP.2024.3451951","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":87,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2084534958","https://openalex.org/W2187089797","https://openalex.org/W2903739847","https://openalex.org/W2963609956","https://openalex.org/W2964002616","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W2972473628","https://openalex.org/W2972802841","https://openalex.org/W2973084242","https://openalex.org/W3015826515","https://openalex.org/W3048217770","https://openalex.org/W3090254849","https://openalex.org/W3095012670","https://openalex.org/W3095410713","https://openalex.org/W3097297926","https://openalex.org/W3150572638","https://openalex.org/W3161436426","https://openalex.org/W3194464626","https://openalex.org/W3196001064","https://openalex.org/W3196584150","https://openalex.org/W3197273793","https://openalex.org/W3197324626","https://openalex.org/W3197763626","https://openalex.org/W3198429080","https://openalex.org/W3205533980","https://openalex.org/W3206189675","https://openalex.org/W3207300132","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3215615641","https://openalex.org/W4225096077","https://openalex.org/W4225274946","https://openalex.org/W4225534571","https://openalex.org/W4225956675","https://openalex.org/W4226132755","https://openalex.org/W4226380987","https://openalex.org/W4226424742","https://openalex.org/W4252812408","https://openalex.org/W4281760581","https://openalex.org/W4283640572","https://openalex.org/W4296068816","https://openalex.org/W4296068817","https://openalex.org/W4297841714","https://openalex.org/W4307323391","https://openalex.org/W4313679638","https://openalex.org/W4323651091","https://openalex.org/W4366460484","https://openalex.org/W4367721746","https://openalex.org/W4372267432","https://openalex.org/W4381786045","https://openalex.org/W4382202703","https://openalex.org/W4385329631","https://openalex.org/W4385764360","https://openalex.org/W4385822479","https://openalex.org/W4385822745","https://openalex.org/W4385823466","https://openalex.org/W4388927799","https://openalex.org/W4389600306","https://openalex.org/W4390075359","https://openalex.org/W4391020683","https://openalex.org/W4392114301","https://openalex.org/W4392538788","https://openalex.org/W4392903365","https://openalex.org/W6748588790","https://openalex.org/W6750489868","https://openalex.org/W6752124048","https://openalex.org/W6752888775","https://openalex.org/W6763832098","https://openalex.org/W6778823374","https://openalex.org/W6780218876","https://openalex.org/W6783527727","https://openalex.org/W6783867762","https://openalex.org/W6796464841","https://openalex.org/W6800393981","https://openalex.org/W6803547063","https://openalex.org/W6805710207","https://openalex.org/W6811227718","https://openalex.org/W6848735303","https://openalex.org/W6850334629","https://openalex.org/W6851724922","https://openalex.org/W6853937136","https://openalex.org/W6858915148","https://openalex.org/W6862144568","https://openalex.org/W6917585676","https://openalex.org/W7062081054"],"related_works":["https://openalex.org/W3013650182","https://openalex.org/W2989283631","https://openalex.org/W4249605382","https://openalex.org/W4313491656","https://openalex.org/W3279617","https://openalex.org/W4402958497","https://openalex.org/W1991183963","https://openalex.org/W2053087750","https://openalex.org/W2146390824","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Neural":[0],"text-to-speech":[1],"(TTS)":[2],"has":[3,101,130,179],"achieved":[4],"human-like":[5],"synthetic":[6],"speech":[7,77,96,136],"for":[8,54,94,107,112,140,195],"single-speaker,":[9],"single-language":[10],"synthesis.":[11,97],"Multilingual":[12],"TTS":[13,32],"systems":[14,33,49],"are":[15,34,165],"limited":[16],"to":[17,21,185],"resource-rich":[18],"languages":[19],"due":[20],"the":[22,153,186,196],"lack":[23],"of":[24,62,126,135,155,183],"large":[25],"paired":[26],"text":[27],"and":[28,71,89,120,138,143,178],"studio-quality":[29],"audio":[30,174],"data.":[31],"typically":[35],"built":[36],"using":[37,57],"a":[38,59,69,80,124,180],"single":[39],"speaker's":[40,188],"voice,":[41,189],"but":[42,110],"there":[43],"is":[44,176],"growing":[45],"interest":[46],"in":[47,133,146],"developing":[48],"that":[50,168,175],"can":[51,172],"synthesize":[52,173],"voices":[53],"new":[55],"speakers":[56,109,145],"only":[58,106],"few":[60],"seconds":[61],"their":[63],"speech.":[64],"This":[65],"paper":[66,86],"presents":[67],"ZMM-TTS,":[68],"multilingual":[70,95],"multispeaker":[72],"framework":[73],"utilizing":[74],"quantized":[75],"latent":[76],"representations":[78],"from":[79],"large-scale,":[81],"pre-trained,":[82],"self-supervised":[83,91],"model.":[84],"Our":[85,98,128],"combines":[87],"text-based":[88],"speech-based":[90],"learning":[92],"models":[93],"proposed":[99,170],"model":[100,129],"zero-shot":[102],"generalization":[103],"ability":[104],"not":[105],"unseen":[108,113,144,198],"also":[111,151],"languages.":[114,149,162],"We":[115,150],"have":[116],"conducted":[117],"comprehensive":[118],"subjective":[119],"objective":[121],"evaluations":[122],"through":[123],"series":[125],"experiments.":[127],"proven":[131],"effective":[132],"terms":[134],"naturalness":[137],"similarity":[139,184],"both":[141],"seen":[142],"six":[147],"high-resource":[148],"tested":[152],"efficiency":[154],"our":[156,169],"method":[157],"on":[158],"two":[159],"hypothetically":[160],"low-resource":[161],"The":[163],"results":[164],"promising,":[166],"indicating":[167],"approach":[171],"intelligible":[177],"high":[181],"degree":[182],"target":[187],"even":[190],"without":[191],"any":[192],"training":[193],"data":[194],"new,":[197],"language.":[199]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":15},{"year":2024,"cited_by_count":8}],"updated_date":"2026-05-28T09:10:13.091523","created_date":"2025-10-10T00:00:00"}
