{"id":"https://openalex.org/W4372266915","doi":"https://doi.org/10.1109/icassp49357.2023.10096480","title":"Period VITS: Variational Inference with Explicit Pitch Modeling for End-To-End Emotional Speech Synthesis","display_name":"Period VITS: Variational Inference with Explicit Pitch Modeling for End-To-End Emotional Speech Synthesis","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372266915","doi":"https://doi.org/10.1109/icassp49357.2023.10096480"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096480","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096480","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011940388","display_name":"Yuma Shirahata","orcid":null},"institutions":[{"id":"https://openalex.org/I4210096607","display_name":"Line Corporation (Japan)","ror":"https://ror.org/00qg8pm87","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210096607","https://openalex.org/I60922564"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yuma Shirahata","raw_affiliation_strings":["LINE Corp.,Tokyo,Japan","LINE Corp., Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"LINE Corp.,Tokyo,Japan","institution_ids":["https://openalex.org/I4210096607"]},{"raw_affiliation_string":"LINE Corp., Tokyo, Japan","institution_ids":["https://openalex.org/I4210096607"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100669485","display_name":"Ryuichi Yamamoto","orcid":"https://orcid.org/0000-0003-0299-5470"},"institutions":[{"id":"https://openalex.org/I4210096607","display_name":"Line Corporation (Japan)","ror":"https://ror.org/00qg8pm87","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210096607","https://openalex.org/I60922564"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Ryuichi Yamamoto","raw_affiliation_strings":["LINE Corp.,Tokyo,Japan","LINE Corp., Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"LINE Corp.,Tokyo,Japan","institution_ids":["https://openalex.org/I4210096607"]},{"raw_affiliation_string":"LINE Corp., Tokyo, Japan","institution_ids":["https://openalex.org/I4210096607"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104035145","display_name":"Eunwoo Song","orcid":"https://orcid.org/0000-0003-0642-7083"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Eunwoo Song","raw_affiliation_strings":["NAVER Corp.,Seongnam,Korea","NAVER Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"NAVER Corp.,Seongnam,Korea","institution_ids":["https://openalex.org/I60922564"]},{"raw_affiliation_string":"NAVER Corp., Seongnam, Korea","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064331370","display_name":"Ryo Terashima","orcid":null},"institutions":[{"id":"https://openalex.org/I4210096607","display_name":"Line Corporation (Japan)","ror":"https://ror.org/00qg8pm87","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210096607","https://openalex.org/I60922564"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Ryo Terashima","raw_affiliation_strings":["LINE Corp.,Tokyo,Japan","LINE Corp., Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"LINE Corp.,Tokyo,Japan","institution_ids":["https://openalex.org/I4210096607"]},{"raw_affiliation_string":"LINE Corp., Tokyo, Japan","institution_ids":["https://openalex.org/I4210096607"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101480212","display_name":"Jae-Min Kim","orcid":"https://orcid.org/0000-0001-9309-9369"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jae-Min Kim","raw_affiliation_strings":["NAVER Corp.,Seongnam,Korea","NAVER Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"NAVER Corp.,Seongnam,Korea","institution_ids":["https://openalex.org/I60922564"]},{"raw_affiliation_string":"NAVER Corp., Seongnam, Korea","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057222718","display_name":"Kentaro Tachibana","orcid":null},"institutions":[{"id":"https://openalex.org/I4210096607","display_name":"Line Corporation (Japan)","ror":"https://ror.org/00qg8pm87","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210096607","https://openalex.org/I60922564"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kentaro Tachibana","raw_affiliation_strings":["LINE Corp.,Tokyo,Japan","LINE Corp., Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"LINE Corp.,Tokyo,Japan","institution_ids":["https://openalex.org/I4210096607"]},{"raw_affiliation_string":"LINE Corp., Tokyo, Japan","institution_ids":["https://openalex.org/I4210096607"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5011940388"],"corresponding_institution_ids":["https://openalex.org/I4210096607"],"apc_list":null,"apc_paid":null,"fwci":2.2494,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.90135866,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.75482177734375},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.6990020275115967},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6706687211990356},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.5917842984199524},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5657430291175842},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.5431226491928101},{"id":"https://openalex.org/keywords/cascade","display_name":"Cascade","score":0.5296468734741211},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.5016934871673584},{"id":"https://openalex.org/keywords/jitter","display_name":"Jitter","score":0.4830258786678314},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.4412948787212372},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.43550971150398254},{"id":"https://openalex.org/keywords/voice","display_name":"Voice","score":0.4280531406402588},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3373889923095703},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.09804585576057434}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.75482177734375},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.6990020275115967},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6706687211990356},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.5917842984199524},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5657430291175842},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.5431226491928101},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.5296468734741211},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.5016934871673584},{"id":"https://openalex.org/C134652429","wikidata":"https://www.wikidata.org/wiki/Q1052698","display_name":"Jitter","level":2,"score":0.4830258786678314},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.4412948787212372},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.43550971150398254},{"id":"https://openalex.org/C552089266","wikidata":"https://www.wikidata.org/wiki/Q494510","display_name":"Voice","level":2,"score":0.4280531406402588},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3373889923095703},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.09804585576057434},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096480","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096480","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.6499999761581421}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W1959608418","https://openalex.org/W2029434926","https://openalex.org/W2519091744","https://openalex.org/W2593414223","https://openalex.org/W2751205669","https://openalex.org/W2908510526","https://openalex.org/W2946200149","https://openalex.org/W2963090522","https://openalex.org/W2963175743","https://openalex.org/W2963609956","https://openalex.org/W2963945466","https://openalex.org/W2964167449","https://openalex.org/W2964243274","https://openalex.org/W2968917279","https://openalex.org/W2990440871","https://openalex.org/W3015338123","https://openalex.org/W3016160783","https://openalex.org/W3033411150","https://openalex.org/W3092028330","https://openalex.org/W3161296985","https://openalex.org/W3162536450","https://openalex.org/W3169905056","https://openalex.org/W3174758275","https://openalex.org/W3196969505","https://openalex.org/W3207340675","https://openalex.org/W4224612669","https://openalex.org/W4280575909","https://openalex.org/W4286899907","https://openalex.org/W4287761884","https://openalex.org/W6610566761","https://openalex.org/W6640963894","https://openalex.org/W6687506355","https://openalex.org/W6757817989","https://openalex.org/W6763832098","https://openalex.org/W6767164110","https://openalex.org/W6778823374","https://openalex.org/W6779337556","https://openalex.org/W6783867762","https://openalex.org/W6796464841","https://openalex.org/W6802838302"],"related_works":["https://openalex.org/W169399214","https://openalex.org/W3100825170","https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W1927421023","https://openalex.org/W10581632","https://openalex.org/W2108985546","https://openalex.org/W2433276473","https://openalex.org/W3149582125","https://openalex.org/W2077992636"],"abstract_inverted_index":{"Several":[0],"fully":[1],"end-to-end":[2,58,122],"text-to-speech":[3],"(TTS)":[4],"models":[5,17,23,157],"have":[6,10],"been":[7],"proposed":[8,69,95,152],"that":[9,61,77,103,150],"shown":[11],"better":[12],"performance":[13],"compared":[14],"to":[15,108],"cascade":[16],"(i.e.,":[18],"training":[19],"acoustic":[20],"and":[21,46,84,127,142],"vocoder":[22],"separately).":[24],"However,":[25],"they":[26],"often":[27],"generate":[28],"unstable":[29],"pitch":[30,75,83,164],"contour":[31],"with":[32,124,162],"audible":[33],"artifacts":[34],"when":[35],"the":[36,68,88,94,105,111,114,133,151,167],"dataset":[37],"contains":[38],"emotional":[39],"attributes,":[40],"i.e.,":[41],"large":[42],"diversity":[43],"of":[44,137,160],"pronunciation":[45],"prosody.":[47],"To":[48],"address":[49],"this":[50],"problem,":[51],"we":[52,71],"propose":[53],"Period":[54],"VITS,":[55],"a":[56,73,99,131],"novel":[57],"TTS":[59],"model":[60,116,153],"incorporates":[62],"an":[63,121],"explicit":[64],"periodicity":[65,96],"generator.":[66],"In":[67],"method,":[70],"introduce":[72],"frame":[74],"predictor":[76],"predicts":[78],"prosodic":[79],"features,":[80,93],"such":[81],"as":[82],"voicing":[85],"flags,":[86],"from":[87],"input":[89],"text.":[90],"From":[91],"these":[92],"generator":[97],"produces":[98],"sample-level":[100],"sinusoidal":[101],"source":[102],"enables":[104],"waveform":[106],"decoder":[107,134],"accurately":[109],"reproduce":[110],"pitch.":[112],"Finally,":[113],"entire":[115],"is":[117],"jointly":[118],"optimized":[119],"in":[120,158,166],"manner":[123],"variational":[125],"inference":[126],"adversarial":[128],"objectives.":[129],"As":[130],"result,":[132],"becomes":[135],"capable":[136],"generating":[138],"more":[139],"stable,":[140],"expressive,":[141],"natural":[143],"output":[144],"waveforms.":[145],"The":[146],"experimental":[147],"results":[148],"showed":[149],"significantly":[154],"outperforms":[155],"baseline":[156],"terms":[159],"naturalness,":[161],"improved":[163],"stability":[165],"generated":[168],"samples.":[169]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
