{"id":"https://openalex.org/W4392903591","doi":"https://doi.org/10.1109/icassp48485.2024.10445804","title":"Promptvc: Flexible Stylistic Voice Conversion in Latent Space Driven by Natural Language Prompts","display_name":"Promptvc: Flexible Stylistic Voice Conversion in Latent Space Driven by Natural Language Prompts","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903591","doi":"https://doi.org/10.1109/icassp48485.2024.10445804"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10445804","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10445804","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015560758","display_name":"Jixun Yao","orcid":"https://orcid.org/0000-0002-5324-7360"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jixun Yao","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101516876","display_name":"Yuguang Yang","orcid":"https://orcid.org/0009-0003-3892-0523"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuguang Yang","raw_affiliation_strings":["Ximalaya Inc,China","Ximalaya Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ximalaya Inc,China","institution_ids":[]},{"raw_affiliation_string":"Ximalaya Inc, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013928267","display_name":"Yi Lei","orcid":"https://orcid.org/0000-0002-9256-9311"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Lei","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081164682","display_name":"Ziqian Ning","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziqian Ning","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101250603","display_name":"Yanni Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yanni Hu","raw_affiliation_strings":["Ximalaya Inc,China","Ximalaya Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ximalaya Inc,China","institution_ids":[]},{"raw_affiliation_string":"Ximalaya Inc, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010088695","display_name":"Yu Pan","orcid":"https://orcid.org/0000-0001-7261-8297"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu Pan","raw_affiliation_strings":["Ximalaya Inc,China","Ximalaya Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ximalaya Inc,China","institution_ids":[]},{"raw_affiliation_string":"Ximalaya Inc, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004857861","display_name":"Jingjing Yin","orcid":"https://orcid.org/0000-0003-4843-613X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jingjing Yin","raw_affiliation_strings":["Ximalaya Inc,China","Ximalaya Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ximalaya Inc,China","institution_ids":[]},{"raw_affiliation_string":"Ximalaya Inc, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110823442","display_name":"Hongbin Zhou","orcid":"https://orcid.org/0009-0005-7569-1197"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongbin Zhou","raw_affiliation_strings":["Ximalaya Inc,China","Ximalaya Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ximalaya Inc,China","institution_ids":[]},{"raw_affiliation_string":"Ximalaya Inc, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100618443","display_name":"Heng Lu","orcid":"https://orcid.org/0009-0009-9236-8825"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heng Lu","raw_affiliation_strings":["Ximalaya Inc,China","Ximalaya Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ximalaya Inc,China","institution_ids":[]},{"raw_affiliation_string":"Ximalaya Inc, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU) School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.2765,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.94656136,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"10571","last_page":"10575"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7389774918556213},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5070550441741943},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.4835493266582489},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.46963998675346375},{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.4476270079612732},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.43416842818260193},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.402086466550827}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7389774918556213},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5070550441741943},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.4835493266582489},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.46963998675346375},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.4476270079612732},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.43416842818260193},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.402086466550827},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10445804","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10445804","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1574447377","https://openalex.org/W2067295501","https://openalex.org/W2107860279","https://openalex.org/W2519091744","https://openalex.org/W2938833595","https://openalex.org/W2947445680","https://openalex.org/W3096939667","https://openalex.org/W3098557217","https://openalex.org/W3163573274","https://openalex.org/W3209059054","https://openalex.org/W3216296943","https://openalex.org/W4205742757","https://openalex.org/W4225264140","https://openalex.org/W4226474318","https://openalex.org/W4283689139","https://openalex.org/W4312933868","https://openalex.org/W4372260157","https://openalex.org/W4372260214","https://openalex.org/W4372267192","https://openalex.org/W4375869257","https://openalex.org/W4385822787","https://openalex.org/W4398152753","https://openalex.org/W6750489868","https://openalex.org/W6763832098","https://openalex.org/W6778823374","https://openalex.org/W6795807602","https://openalex.org/W6796464841"],"related_works":["https://openalex.org/W2905433371","https://openalex.org/W2888392564","https://openalex.org/W4310278675","https://openalex.org/W4388422664","https://openalex.org/W4390569940","https://openalex.org/W4361193272","https://openalex.org/W2963326959","https://openalex.org/W4226226396","https://openalex.org/W3153750606","https://openalex.org/W4308854837"],"abstract_inverted_index":{"Stylistic":[0],"voice":[1,24,68],"conversion":[2,25,37,69],"aims":[3],"to":[4,11,16,34,41,77,108,131,143,167,184],"transform":[5],"the":[6,21,36,52,88,101,110,139,146,157,169,177,180,194],"style":[7,14,23,44,57,67,80,89,95,111,126,152],"of":[8,51,56,171,179,196],"source":[9],"speech":[10,33],"a":[12,65,73,79,94,163],"desired":[13],"according":[15],"real-world":[17],"application":[18],"demands.":[19],"However,":[20],"current":[22],"approach":[26,70],"relies":[27],"on":[28,120],"pre-defined":[29],"labels":[30],"or":[31,46],"reference":[32],"control":[35],"process,":[38],"which":[39,149,174],"leads":[40],"limitations":[42],"in":[43,49],"diversity":[45],"falls":[47],"short":[48],"terms":[50],"intuitive":[53],"and":[54,99,135,161,189],"interpretability":[55],"representation.":[58],"In":[59],"this":[60,116],"study,":[61],"we":[62,128,155],"propose":[63],"PromptVC,":[64],"novel":[66],"that":[71],"employs":[72],"latent":[74,102],"diffusion":[75,103],"model":[76,104],"generate":[78],"vector":[81,90,112],"driven":[82],"by":[83,93],"natural":[84,121],"language":[85,122],"prompts.":[86,123],"Specifically,":[87],"is":[91,105],"extracted":[92],"encoder":[96],"during":[97],"training,":[98],"then":[100],"trained":[106],"independently":[107],"sample":[109],"from":[113],"noise,":[114],"with":[115,138],"process":[117],"being":[118],"conditioned":[119],"To":[124],"improve":[125],"expressiveness,":[127],"leverage":[129],"HuBERT":[130],"extract":[132],"discrete":[133,159],"tokens":[134],"replace":[136],"them":[137],"K-Means":[140],"center":[141],"embedding":[142],"serve":[144],"as":[145],"linguistic":[147,182],"content,":[148],"minimizes":[150],"residual":[151],"information.":[153],"Additionally,":[154],"deduplicate":[156],"same":[158,181],"token":[160],"employ":[162],"differentiable":[164],"duration":[165,170,178],"predictor":[166],"re-predict":[168],"each":[172],"token,":[173],"can":[175],"adapt":[176],"content":[183],"different":[185],"styles.":[186],"The":[187],"subjective":[188],"objective":[190],"evaluation":[191],"results":[192],"demonstrate":[193],"effectiveness":[195],"our":[197],"proposed":[198],"system.":[199]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
