{"id":"https://openalex.org/W4372263908","doi":"https://doi.org/10.1109/icassp49357.2023.10094745","title":"Imaginary Voice: Face-Styled Diffusion Model for Text-to-Speech","display_name":"Imaginary Voice: Face-Styled Diffusion Model for Text-to-Speech","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372263908","doi":"https://doi.org/10.1109/icassp49357.2023.10094745"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10094745","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10094745","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100447440","display_name":"Ji-Young Lee","orcid":"https://orcid.org/0000-0002-0892-3454"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Jiyoung Lee","raw_affiliation_strings":["NAVER AI Lab,South Korea"],"affiliations":[{"raw_affiliation_string":"NAVER AI Lab,South Korea","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038723822","display_name":"Joon Son Chung","orcid":"https://orcid.org/0000-0001-7741-7275"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Joon Son Chung","raw_affiliation_strings":["Korea Advanced Institute of Science and Technology,South Korea"],"affiliations":[{"raw_affiliation_string":"Korea Advanced Institute of Science and Technology,South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046276512","display_name":"Soo-Whan Chung","orcid":"https://orcid.org/0000-0001-6529-2196"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Soo-Whan Chung","raw_affiliation_strings":["NAVER Cloud,South Korea"],"affiliations":[{"raw_affiliation_string":"NAVER Cloud,South Korea","institution_ids":["https://openalex.org/I60922564"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100447440"],"corresponding_institution_ids":["https://openalex.org/I60922564"],"apc_list":null,"apc_paid":null,"fwci":3.1014,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.92547382,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7780431509017944},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6984860897064209},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.6774925589561462},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5284782648086548},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5051448941230774},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4915193021297455},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.46773988008499146},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45574143528938293},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.44657981395721436},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3831157982349396},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.28686758875846863},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.17723524570465088}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7780431509017944},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6984860897064209},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6774925589561462},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5284782648086548},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5051448941230774},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4915193021297455},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.46773988008499146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45574143528938293},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.44657981395721436},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3831157982349396},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.28686758875846863},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.17723524570465088},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10094745","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10094745","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4399999976158142,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W2129069237","https://openalex.org/W2129142580","https://openalex.org/W2227324914","https://openalex.org/W2519091744","https://openalex.org/W2808631503","https://openalex.org/W2890952074","https://openalex.org/W2895976713","https://openalex.org/W2963609956","https://openalex.org/W2963801643","https://openalex.org/W2963887950","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W3015338123","https://openalex.org/W3015734344","https://openalex.org/W3016663370","https://openalex.org/W3023351797","https://openalex.org/W3024147341","https://openalex.org/W3026874504","https://openalex.org/W3036167779","https://openalex.org/W3092028330","https://openalex.org/W3096806995","https://openalex.org/W3110257065","https://openalex.org/W3128910262","https://openalex.org/W3129651364","https://openalex.org/W3162926177","https://openalex.org/W3172148458","https://openalex.org/W3182657421","https://openalex.org/W4224035735","https://openalex.org/W4224920206","https://openalex.org/W4225566824","https://openalex.org/W4226125322","https://openalex.org/W4298174729","https://openalex.org/W4312933868","https://openalex.org/W4320169845","https://openalex.org/W4320459320","https://openalex.org/W6748573829","https://openalex.org/W6754850772","https://openalex.org/W6777214184","https://openalex.org/W6777694618","https://openalex.org/W6779823529","https://openalex.org/W6783182287","https://openalex.org/W6783867762","https://openalex.org/W6786375611","https://openalex.org/W6790220310","https://openalex.org/W6795261426","https://openalex.org/W6795288823","https://openalex.org/W6809885388","https://openalex.org/W6810940779","https://openalex.org/W6849953009","https://openalex.org/W6849956205","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W2081900870","https://openalex.org/W2183306018","https://openalex.org/W2549990292","https://openalex.org/W2345479200","https://openalex.org/W2951819827","https://openalex.org/W2849310602","https://openalex.org/W2419146053","https://openalex.org/W2088247287","https://openalex.org/W3006008237","https://openalex.org/W2132337154"],"abstract_inverted_index":{"The":[0,166],"goal":[1],"of":[2,29,105],"this":[3],"work":[4],"is":[5,58,122,169],"zero-shot":[6],"text-to-speech":[7,44],"synthesis,":[8],"with":[9],"speaking":[10,164],"styles":[11],"and":[12,79,89,108,142,147,162],"voices":[13],"learnt":[14,51],"from":[15,52,125,140],"facial":[16],"characteristics.":[17],"Inspired":[18],"by":[19],"the":[20,27,59,103,106,109,119,126,149,152],"natural":[21],"fact":[22],"that":[23,62],"people":[24],"can":[25],"imagine":[26],"voice":[28],"someone":[30],"when":[31],"they":[32],"look":[33],"at":[34],"his":[35],"or":[36],"her":[37],"face,":[38],"we":[39],"introduce":[40],"a":[41,48,68,72,96],"face-styled":[42],"diffusion":[43],"(TTS)":[45],"model":[46,150],"within":[47],"unified":[49],"framework":[50],"visible":[53],"attributes,":[54],"called":[55],"Face-TTS.":[56],"This":[57],"first":[60],"time":[61],"face":[63,87,127],"images":[64,88],"are":[65],"used":[66],"as":[67],"condition":[69],"to":[70,82,101,137],"train":[71,76,146],"TTS":[73,80],"model.We":[74],"jointly":[75],"cross-model":[77],"biometrics":[78],"models":[81],"preserve":[83],"speaker":[84,97,115],"identity":[85],"between":[86],"generated":[90,107],"speech":[91,112,139],"segments.":[92],"We":[93,145],"also":[94],"propose":[95],"feature":[98],"binding":[99],"loss":[100],"enforce":[102],"similarity":[104],"ground":[110],"truth":[111],"segments":[113],"in":[114],"embedding":[116],"space.":[117],"Since":[118],"biometric":[120],"information":[121],"extracted":[123],"directly":[124],"image,":[128],"our":[129],"method":[130],"does":[131],"not":[132],"require":[133],"extra":[134],"fine-tuning":[135],"steps":[136],"generate":[138],"unseen":[141],"unheard":[143],"speakers.":[144],"evaluate":[148],"on":[151],"LRS3":[153],"dataset,":[154],"an":[155],"in-the-wild":[156],"audio-visual":[157],"corpus":[158],"containing":[159],"background":[160],"noise":[161],"diverse":[163],"styles.":[165],"project":[167],"page":[168],"https://facetts.github.io.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
