{"id":"https://openalex.org/W4392124090","doi":"https://doi.org/10.1109/taslp.2024.3369528","title":"EfficientTTS 2: Variational End-to-End Text-to-Speech Synthesis and Voice Conversion","display_name":"EfficientTTS 2: Variational End-to-End Text-to-Speech Synthesis and Voice Conversion","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4392124090","doi":"https://doi.org/10.1109/taslp.2024.3369528"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3369528","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3369528","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/10304349/10444060.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://ieeexplore.ieee.org/ielx7/6570655/10304349/10444060.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093993569","display_name":"Chenfeng Miao","orcid":"https://orcid.org/0009-0006-1986-3035"},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chenfeng Miao","raw_affiliation_strings":["Ping An Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-1986-3035","affiliations":[{"raw_affiliation_string":"Ping An Technology, Shanghai, China","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087698068","display_name":"Qingying Zhu","orcid":"https://orcid.org/0009-0003-0155-0709"},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingying Zhu","raw_affiliation_strings":["Ping An Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0003-0155-0709","affiliations":[{"raw_affiliation_string":"Ping An Technology, Shanghai, China","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059638789","display_name":"Minchuan Chen","orcid":"https://orcid.org/0009-0001-1512-6672"},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minchuan Chen","raw_affiliation_strings":["Ping An Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0001-1512-6672","affiliations":[{"raw_affiliation_string":"Ping An Technology, Shanghai, China","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100643103","display_name":"Jun Ma","orcid":"https://orcid.org/0009-0003-8713-0667"},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Ma","raw_affiliation_strings":["Ping An Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0003-8713-0667","affiliations":[{"raw_affiliation_string":"Ping An Technology, Shanghai, China","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100702017","display_name":"Shaojun Wang","orcid":"https://orcid.org/0009-0001-8955-8566"},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaojun Wang","raw_affiliation_strings":["Ping An Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0001-8955-8566","affiliations":[{"raw_affiliation_string":"Ping An Technology, Shanghai, China","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016038454","display_name":"Jing Xiao","orcid":"https://orcid.org/0000-0001-9615-4749"},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Xiao","raw_affiliation_strings":["Ping An Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-9615-4749","affiliations":[{"raw_affiliation_string":"Ping An Technology, Shanghai, China","institution_ids":["https://openalex.org/I4401726822"]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5093993569"],"corresponding_institution_ids":["https://openalex.org/I4401726822"],"apc_list":null,"apc_paid":null,"fwci":4.6359,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.95151234,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"1650","last_page":"1661"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7553942203521729},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6256190538406372},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6187795400619507},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.5754533410072327},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.5281382203102112},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4751405715942383},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.45279017090797424},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.424969345331192},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3069797158241272},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.06984603404998779}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7553942203521729},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6256190538406372},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6187795400619507},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.5754533410072327},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.5281382203102112},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4751405715942383},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.45279017090797424},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.424969345331192},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3069797158241272},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.06984603404998779},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3369528","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3369528","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/10304349/10444060.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/taslp.2024.3369528","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3369528","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/10304349/10444060.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4392124090.pdf"},"referenced_works_count":40,"referenced_works":["https://openalex.org/W1583912456","https://openalex.org/W2105482032","https://openalex.org/W2107860279","https://openalex.org/W2747874407","https://openalex.org/W2808631503","https://openalex.org/W2963091184","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W2998572311","https://openalex.org/W3015282541","https://openalex.org/W3015338123","https://openalex.org/W3090254849","https://openalex.org/W3091928890","https://openalex.org/W3161782335","https://openalex.org/W3197294703","https://openalex.org/W3198234802","https://openalex.org/W3198769980","https://openalex.org/W4385245566","https://openalex.org/W4391020683","https://openalex.org/W6635084905","https://openalex.org/W6640963894","https://openalex.org/W6745697700","https://openalex.org/W6753855596","https://openalex.org/W6757817989","https://openalex.org/W6763832098","https://openalex.org/W6772349387","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6779337556","https://openalex.org/W6779823529","https://openalex.org/W6783527727","https://openalex.org/W6783867762","https://openalex.org/W6784545093","https://openalex.org/W6786494455","https://openalex.org/W6787300339","https://openalex.org/W6795261426","https://openalex.org/W6796464841","https://openalex.org/W6805710207","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W1974895211","https://openalex.org/W2129841057","https://openalex.org/W3040712279","https://openalex.org/W2176409448","https://openalex.org/W2364769705","https://openalex.org/W2056136368","https://openalex.org/W3179968364","https://openalex.org/W2374664672","https://openalex.org/W2039489009"],"abstract_inverted_index":{"Recently,":[0],"the":[1,64,67,89,110],"field":[2],"of":[3,69],"Text-to-Speech":[4],"(TTS)":[5],"has":[6],"been":[7],"dominated":[8],"by":[9],"one-stage":[10,32],"text-to-waveform":[11],"models":[12,113],"which":[13],"have":[14],"significantly":[15],"improved":[16],"speech":[17,120],"quality":[18,121],"compared":[19,122],"to":[20,88,123],"two-stage":[21],"models.":[22],"In":[23],"this":[24],"work,":[25],"we":[26,85],"propose":[27,95],"EfficientTTS":[28],"2":[29],"(EFTS2),":[30],"a":[31,52,56],"high-quality":[33,103],"end-to-end":[34,98],"TTS":[35,81],"framework":[36],"that":[37,101,109],"is":[38],"fully":[39],"differentiable":[40,53],"and":[41,55,74,94,132],"highly":[42],"efficient.":[43],"Our":[44],"method":[45],"adopts":[46],"an":[47,97],"adversarial":[48],"training":[49,76],"process,":[50],"with":[51],"aligner":[54],"hierarchical-VAE-based":[57],"waveform":[58],"generator.":[59],"These":[60],"design":[61],"choices":[62],"free":[63],"model":[65,100,134],"from":[66],"use":[68],"external":[70],"aligners,":[71],"invertible":[72],"structures,":[73],"complex":[75],"procedures":[77],"as":[78],"most":[79],"previous":[80],"works":[82],"have.":[83],"Moreover,":[84],"extend":[86],"EFTS2":[87],"voice":[90],"conversion":[91],"(VC)":[92],"task":[93],"EFTS2-VC,":[96],"VC":[99],"allows":[102],"speech-to-speech":[104],"conversion.":[105],"Experimental":[106],"results":[107],"suggest":[108],"two":[111],"proposed":[112],"achieve":[114],"better":[115],"or":[116],"at":[117],"least":[118],"comparable":[119],"baseline":[124],"models,":[125],"while":[126],"also":[127],"providing":[128],"faster":[129],"inference":[130],"speeds":[131],"smaller":[133],"sizes.":[135]},"counts_by_year":[{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
