{"id":"https://openalex.org/W4375869257","doi":"https://doi.org/10.1109/icassp49357.2023.10096285","title":"Prompttts: Controllable Text-To-Speech With Text Descriptions","display_name":"Prompttts: Controllable Text-To-Speech With Text Descriptions","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4375869257","doi":"https://doi.org/10.1109/icassp49357.2023.10096285"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096285","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022081273","display_name":"Zhifang Guo","orcid":"https://orcid.org/0000-0002-3728-6319"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]},{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN","US"],"is_corresponding":true,"raw_author_name":"Zhifang Guo","raw_affiliation_strings":["Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China"],"affiliations":[{"raw_affiliation_string":"Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369","https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China","institution_ids":["https://openalex.org/I78988378","https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053168506","display_name":"Yichong Leng","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Yichong Leng","raw_affiliation_strings":["Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China"],"affiliations":[{"raw_affiliation_string":"Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369","https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China","institution_ids":["https://openalex.org/I78988378","https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054879184","display_name":"Yihan Wu","orcid":"https://orcid.org/0000-0002-1202-4208"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]},{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Yihan Wu","raw_affiliation_strings":["Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China"],"affiliations":[{"raw_affiliation_string":"Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369","https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China","institution_ids":["https://openalex.org/I78988378","https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100329353","display_name":"Sheng Zhao","orcid":"https://orcid.org/0000-0002-9624-5381"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]},{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Sheng Zhao","raw_affiliation_strings":["Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China"],"affiliations":[{"raw_affiliation_string":"Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369","https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China","institution_ids":["https://openalex.org/I78988378","https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101522530","display_name":"Xu Tan","orcid":"https://orcid.org/0000-0001-5631-0639"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]},{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Xu Tan","raw_affiliation_strings":["Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China"],"affiliations":[{"raw_affiliation_string":"Univerisity of Science and Technology of China, Renmin University of China,Microsoft Azure Speech, Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369","https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Azure Speech, Microsoft Research Asia, Univerisity of Science and Technology of China, Renmin University of China","institution_ids":["https://openalex.org/I78988378","https://openalex.org/I4210113369"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5022081273"],"corresponding_institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I4210113369","https://openalex.org/I78988378"],"apc_list":null,"apc_paid":null,"fwci":20.3338,"has_fulltext":false,"cited_by_count":117,"citation_normalized_percentile":{"value":0.9958879,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.7468693256378174},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7373679876327515},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.6243112087249756},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5824432373046875},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5791040658950806},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.5549759864807129},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.522443950176239},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.47487589716911316},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4366738796234131}],"concepts":[{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.7468693256378174},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7373679876327515},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.6243112087249756},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5824432373046875},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5791040658950806},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.5549759864807129},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.522443950176239},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.47487589716911316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4366738796234131},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096285","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W160640889","https://openalex.org/W349236604","https://openalex.org/W2405756170","https://openalex.org/W2794490148","https://openalex.org/W2896457183","https://openalex.org/W2972359262","https://openalex.org/W2972677740","https://openalex.org/W3033411150","https://openalex.org/W3034445277","https://openalex.org/W3092028330","https://openalex.org/W3094650042","https://openalex.org/W3158631574","https://openalex.org/W3174758275","https://openalex.org/W3176617251","https://openalex.org/W3197541421","https://openalex.org/W4221167022","https://openalex.org/W4224035735","https://openalex.org/W4280561221","https://openalex.org/W4285247752","https://openalex.org/W4292779060","https://openalex.org/W6713645886","https://openalex.org/W6750489868","https://openalex.org/W6755207826","https://openalex.org/W6778823374","https://openalex.org/W6778883912","https://openalex.org/W6779879114","https://openalex.org/W6783867762"],"related_works":["https://openalex.org/W1927421023","https://openalex.org/W10581632","https://openalex.org/W3149582125","https://openalex.org/W1984347656","https://openalex.org/W1965141925","https://openalex.org/W2465421051","https://openalex.org/W652196294","https://openalex.org/W2368700418","https://openalex.org/W2587342322","https://openalex.org/W2540115864"],"abstract_inverted_index":{"Using":[0],"a":[1,46,55,74,78,90,135,170],"text":[2,11,24,37,132],"description":[3],"as":[4,51,63,123],"prompt":[5,56],"to":[6,39,65,81,93,97,114,118,139,147,160],"guide":[7,40],"the":[8,33,67,83,87,98,162,180],"generation":[9],"of":[10,35,73,164],"or":[12,16],"images":[13],"(e.g.,":[14,143],"GPT-3":[15],"DALLE-2)":[17],"has":[18],"drawn":[19],"wide":[20],"attention":[21],"recently.":[22],"Beyond":[23],"and":[25,60,77,89,101,125,168,176,179,194,200],"image":[26],"generation,":[27],"in":[28,108],"this":[29],"work,":[30],"we":[31,44,166],"explore":[32],"possibility":[34],"utilizing":[36],"descriptions":[38,62,133],"speech":[41,91,95,141,189,196],"synthesis.":[42],"Thus,":[43],"develop":[45],"text-to-speech":[47],"(TTS)":[48],"system":[49],"(dubbed":[50],"PromptTTS)":[52],"that":[53,111,152,185],"takes":[54],"with":[57,105,158,174,190],"both":[58],"style":[59,75,100,120,142,175,192],"content":[61,79,102,177],"input":[64],"synthesize":[66,94],"corresponding":[68,84,181],"speech.":[69,182],"Specifically,":[70],"PromptTTS":[71,127,186],"consists":[72],"encoder":[76,80],"extract":[82],"representations":[85],"from":[86],"prompt,":[88],"decoder":[92],"according":[96],"extracted":[99],"representations.":[103],"Compared":[104],"previous":[106],"works":[107],"controllable":[109],"TTS":[110,156],"require":[112],"users":[113],"have":[115],"acoustic":[116],"knowledge":[117],"understand":[119],"factors":[121],"such":[122],"prosody":[124],"pitch,":[126],"is":[128,154],"more":[129,136],"user-friendly":[130],"since":[131],"are":[134,203],"natural":[137],"way":[138],"express":[140],"\"A":[144],"lady":[145],"whispers":[146],"her":[148],"friend":[149],"slowly\").":[150],"Given":[151],"there":[153],"no":[155],"dataset":[157,171,202],"prompts,":[159],"benchmark":[161],"task":[163],"PromptTTS,":[165],"construct":[167],"release":[169],"containing":[172],"prompts":[173],"information":[178],"Experiments":[183],"show":[184],"can":[187],"generate":[188],"precise":[191],"control":[193],"high":[195],"quality.":[197],"Audio":[198],"samples":[199],"our":[201],"publicly":[204],"available":[205],"<sup":[206],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[207],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[208],".":[209]},"counts_by_year":[{"year":2026,"cited_by_count":53},{"year":2025,"cited_by_count":32},{"year":2024,"cited_by_count":23},{"year":2023,"cited_by_count":9}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
