{"id":"https://openalex.org/W4392904630","doi":"https://doi.org/10.1109/icassp48485.2024.10446191","title":"PAVITS: Exploring Prosody-Aware VITS for End-to-End Emotional Voice Conversion","display_name":"PAVITS: Exploring Prosody-Aware VITS for End-to-End Emotional Voice Conversion","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392904630","doi":"https://doi.org/10.1109/icassp48485.2024.10446191"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446191","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446191","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040782290","display_name":"Tianhua Qi","orcid":"https://orcid.org/0009-0005-5780-9374"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tianhua Qi","raw_affiliation_strings":["Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","School of Biological Science and Medical Engineering, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Biological Science and Medical Engineering, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029771864","display_name":"Wenming Zheng","orcid":"https://orcid.org/0000-0002-7764-5179"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenming Zheng","raw_affiliation_strings":["Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","School of Biological Science and Medical Engineering, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Biological Science and Medical Engineering, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054796879","display_name":"Cheng Lu","orcid":"https://orcid.org/0000-0002-1477-1020"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cheng Lu","raw_affiliation_strings":["Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","School of Biological Science and Medical Engineering, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Biological Science and Medical Engineering, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027316177","display_name":"Yuan Zong","orcid":"https://orcid.org/0000-0002-0839-8792"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuan Zong","raw_affiliation_strings":["Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","School of Biological Science and Medical Engineering, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Biological Science and Medical Engineering, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091060125","display_name":"Hailun Lian","orcid":"https://orcid.org/0000-0002-1355-9503"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hailun Lian","raw_affiliation_strings":["Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Child Development and Learning Science (Southeast University),Ministry of Education,Nanjing,China,210096","institution_ids":["https://openalex.org/I76569877"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5040782290"],"corresponding_institution_ids":["https://openalex.org/I76569877"],"apc_list":null,"apc_paid":null,"fwci":3.6216,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.93566508,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"12697","last_page":"12701"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.9811999201774597},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.9534807205200195},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7051318883895874},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6279050707817078},{"id":"https://openalex.org/keywords/emotional-prosody","display_name":"Emotional prosody","score":0.5420137643814087},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5005886554718018},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.47368481755256653},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.46912169456481934},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.37611210346221924},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.34582123160362244},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.22899377346038818}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.9811999201774597},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.9534807205200195},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7051318883895874},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6279050707817078},{"id":"https://openalex.org/C2778262033","wikidata":"https://www.wikidata.org/wiki/Q5373795","display_name":"Emotional prosody","level":3,"score":0.5420137643814087},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5005886554718018},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.47368481755256653},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.46912169456481934},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37611210346221924},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34582123160362244},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.22899377346038818},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446191","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446191","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.4300000071525574}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2102298245","https://openalex.org/W2149628368","https://openalex.org/W2166821106","https://openalex.org/W2937579788","https://openalex.org/W3015719316","https://openalex.org/W3034600949","https://openalex.org/W3128401974","https://openalex.org/W3162791003","https://openalex.org/W3162993161","https://openalex.org/W3163573274","https://openalex.org/W3196643119","https://openalex.org/W3197993066","https://openalex.org/W3207340675","https://openalex.org/W4200225196","https://openalex.org/W4205742757","https://openalex.org/W4221147462","https://openalex.org/W4226474318","https://openalex.org/W4286515285","https://openalex.org/W4296069352","https://openalex.org/W4296334967","https://openalex.org/W4312637065","https://openalex.org/W4361994820","https://openalex.org/W4372266915","https://openalex.org/W6778823374","https://openalex.org/W6779337556","https://openalex.org/W6796464841"],"related_works":["https://openalex.org/W169399214","https://openalex.org/W2414392375","https://openalex.org/W4391272374","https://openalex.org/W2112134350","https://openalex.org/W4205555064","https://openalex.org/W79954020","https://openalex.org/W3106706667","https://openalex.org/W81388513","https://openalex.org/W2442195447","https://openalex.org/W2169632867"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,46,69,97,113,131],"propose":[4,114],"Prosody-aware":[5],"VITS":[6],"(PAVITS)":[7],"for":[8,31],"emotional":[9,26,78,95],"voice":[10],"conversion":[11,83],"(EVC),":[12],"aiming":[13],"to":[14,102,137,162],"achieve":[15],"two":[16,146],"major":[17],"objectives":[18],"of":[19,35,43,59,75,108,158],"EVC:":[20],"high":[21,25,56],"content":[22,41],"naturalness":[23,42],"and":[24,67,81],"naturalness,":[27,96],"which":[28,118],"are":[29,169],"crucial":[30],"meeting":[32],"the":[33,40,55,72,94,104,126,156,163],"demands":[34],"human":[36],"perception.":[37],"To":[38,91],"improve":[39],"converted":[44],"audio,":[45],"have":[47],"developed":[48],"an":[49,64,99],"end-to-end":[50],"EVC":[51,89,165],"architecture":[52],"inspired":[53],"by":[54],"audio":[57],"quality":[58],"VITS.":[60],"By":[61],"seamlessly":[62],"integrating":[63],"acoustic":[65],"converter":[66],"vocoder,":[68],"effectively":[70],"address":[71],"common":[73],"issue":[74],"mismatch":[76],"between":[77,141],"prosody":[79,106,116,120,134,143],"training":[80],"run-time":[82],"that":[84,155],"is":[85,160],"prevalent":[86],"in":[87],"existing":[88],"models.":[90],"further":[92],"enhance":[93],"introduce":[98,132],"emotion":[100,128],"descriptor":[101],"model":[103],"subtle":[105],"variations":[107],"different":[109],"speech":[110],"emotions.":[111],"Additionally,":[112],"a":[115,133,139],"predictor,":[117],"predicts":[119],"features":[121,144],"from":[122,145],"text":[123],"based":[124],"on":[125],"provided":[127],"label.":[129],"Notably,":[130],"alignment":[135],"loss":[136],"establish":[138],"connection":[140],"latent":[142],"distinct":[147],"modalities,":[148],"ensuring":[149],"effective":[150],"training.":[151],"Experimental":[152],"results":[153],"show":[154],"performance":[157],"PAVITS":[159],"superior":[161],"state-of-the-art":[164],"methods.":[166],"Speech":[167],"Samples":[168],"available":[170],"at":[171],"https://jeremychee4.github.io/pavits4EVC/.":[172]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
