{"id":"https://openalex.org/W4405709658","doi":"https://doi.org/10.1109/iscslp63861.2024.10800185","title":"Bridging Facial Imagery and Vocal Reality: Stable Diffusion-Enhanced Voice Generation","display_name":"Bridging Facial Imagery and Vocal Reality: Stable Diffusion-Enhanced Voice Generation","publication_year":2024,"publication_date":"2024-11-07","ids":{"openalex":"https://openalex.org/W4405709658","doi":"https://doi.org/10.1109/iscslp63861.2024.10800185"},"language":"en","primary_location":{"id":"doi:10.1109/iscslp63861.2024.10800185","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp63861.2024.10800185","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100768194","display_name":"Yueqian Lin","orcid":"https://orcid.org/0000-0003-1473-8981"},"institutions":[{"id":"https://openalex.org/I4210159968","display_name":"Duke Kunshan University","ror":"https://ror.org/04sr5ys16","country_code":"CN","type":"education","lineage":["https://openalex.org/I170897317","https://openalex.org/I37461747","https://openalex.org/I4210159968"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yueqian Lin","raw_affiliation_strings":["Data Science Research Center, Duke Kunshan University,Kunshan"],"affiliations":[{"raw_affiliation_string":"Data Science Research Center, Duke Kunshan University,Kunshan","institution_ids":["https://openalex.org/I4210159968"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032176468","display_name":"Dong Liu","orcid":"https://orcid.org/0000-0003-4346-9565"},"institutions":[{"id":"https://openalex.org/I4210159968","display_name":"Duke Kunshan University","ror":"https://ror.org/04sr5ys16","country_code":"CN","type":"education","lineage":["https://openalex.org/I170897317","https://openalex.org/I37461747","https://openalex.org/I4210159968"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dong Liu","raw_affiliation_strings":["Data Science Research Center, Duke Kunshan University,Kunshan"],"affiliations":[{"raw_affiliation_string":"Data Science Research Center, Duke Kunshan University,Kunshan","institution_ids":["https://openalex.org/I4210159968"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031419527","display_name":"Yunfei Xu","orcid":"https://orcid.org/0000-0002-7397-811X"},"institutions":[{"id":"https://openalex.org/I180662265","display_name":"China Mobile (China)","ror":"https://ror.org/05gftfe97","country_code":"CN","type":"company","lineage":["https://openalex.org/I180662265"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunfei Xu","raw_affiliation_strings":["Guangdong OPPO Mobile Telecommunications Corp., Ltd"],"affiliations":[{"raw_affiliation_string":"Guangdong OPPO Mobile Telecommunications Corp., Ltd","institution_ids":["https://openalex.org/I180662265"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109630497","display_name":"Hongbin Suo","orcid":null},"institutions":[{"id":"https://openalex.org/I180662265","display_name":"China Mobile (China)","ror":"https://ror.org/05gftfe97","country_code":"CN","type":"company","lineage":["https://openalex.org/I180662265"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongbin Suo","raw_affiliation_strings":["Guangdong OPPO Mobile Telecommunications Corp., Ltd"],"affiliations":[{"raw_affiliation_string":"Guangdong OPPO Mobile Telecommunications Corp., Ltd","institution_ids":["https://openalex.org/I180662265"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5092339711","display_name":"Ming Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159968","display_name":"Duke Kunshan University","ror":"https://ror.org/04sr5ys16","country_code":"CN","type":"education","lineage":["https://openalex.org/I170897317","https://openalex.org/I37461747","https://openalex.org/I4210159968"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Li","raw_affiliation_strings":["Data Science Research Center, Duke Kunshan University,Kunshan"],"affiliations":[{"raw_affiliation_string":"Data Science Research Center, Duke Kunshan University,Kunshan","institution_ids":["https://openalex.org/I4210159968"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100768194"],"corresponding_institution_ids":["https://openalex.org/I4210159968"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.245958,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"229","last_page":"233"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9142000079154968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9142000079154968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.7666417360305786},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5717871189117432},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.42748284339904785},{"id":"https://openalex.org/keywords/vocal-folds","display_name":"Vocal folds","score":0.42315560579299927},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.06452053785324097},{"id":"https://openalex.org/keywords/larynx","display_name":"Larynx","score":0.059684693813323975},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.050231486558914185}],"concepts":[{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.7666417360305786},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5717871189117432},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42748284339904785},{"id":"https://openalex.org/C2780336059","wikidata":"https://www.wikidata.org/wiki/Q215558","display_name":"Vocal folds","level":3,"score":0.42315560579299927},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.06452053785324097},{"id":"https://openalex.org/C2780474809","wikidata":"https://www.wikidata.org/wiki/Q9637","display_name":"Larynx","level":2,"score":0.059684693813323975},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.050231486558914185},{"id":"https://openalex.org/C141071460","wikidata":"https://www.wikidata.org/wiki/Q40821","display_name":"Surgery","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iscslp63861.2024.10800185","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp63861.2024.10800185","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/5","display_name":"Gender equality","score":0.5}],"awards":[{"id":"https://openalex.org/G1270229601","display_name":null,"funder_award_id":"62171207","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2277195237","https://openalex.org/W2726515241","https://openalex.org/W2964350391","https://openalex.org/W3095545636","https://openalex.org/W3163573274","https://openalex.org/W3209984917","https://openalex.org/W4312933868","https://openalex.org/W4319862477","https://openalex.org/W4372259784","https://openalex.org/W4372263908","https://openalex.org/W4372267276","https://openalex.org/W4375869257","https://openalex.org/W4385823432","https://openalex.org/W4390872325","https://openalex.org/W4391020683","https://openalex.org/W4392902963","https://openalex.org/W6755207826","https://openalex.org/W6763832098","https://openalex.org/W6791353385","https://openalex.org/W6796730497","https://openalex.org/W6840155194","https://openalex.org/W6847363464","https://openalex.org/W6848735303","https://openalex.org/W6849600165","https://openalex.org/W6855375322","https://openalex.org/W6856281210","https://openalex.org/W6857022241","https://openalex.org/W6858516312","https://openalex.org/W6859908464","https://openalex.org/W6862012328"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4388870064","https://openalex.org/W2210139803","https://openalex.org/W4235186151","https://openalex.org/W2054685365","https://openalex.org/W2056057048","https://openalex.org/W2667588871","https://openalex.org/W2272354214"],"abstract_inverted_index":{"Generating":[0],"novel":[1],"voices":[2,15,123],"in":[3,19,28],"speech":[4,40,113,135,155],"synthesis":[5],"is":[6,31,157],"a":[7,77,89,93,105],"challenging":[8],"task":[9],"with":[10],"potential":[11,132],"for":[12,38,71,115,140],"creating":[13],"versatile":[14],"that":[16,121],"are":[17],"needed":[18],"entertainment":[20],"and":[21,92,154],"research.":[22],"One":[23],"of":[24,34,80],"the":[25,32,99,122,126,141,151],"primary":[26],"obstacles":[27],"this":[29,47],"area":[30],"lack":[33],"well-annotated":[35],"voice":[36,73,116],"descriptions":[37],"expressive":[39],"corpora.":[41],"Our":[42],"research":[43],"aims":[44],"to":[45,66,97,108,112],"address":[46],"issue":[48],"by":[49],"representing":[50],"speaker":[51],"styles":[52],"from":[53,125],"vision.":[54],"We":[55],"introduce":[56],"Stable":[57,64,94],"Diffusion-Enhanced":[58],"Voice":[59],"Generation":[60],"(SD-EVG),":[61],"which":[62],"leverages":[63],"Diffusion":[65,95],"generate":[67],"imaginary":[68],"facial":[69,81,110,128],"images":[70,82],"new":[72],"generation.":[74,117],"To":[75],"create":[76],"reference":[78],"set":[79],"based":[83],"on":[84],"realistic":[85],"voices,":[86],"SD-EVG":[87,103],"employs":[88],"transformer":[90],"encoder":[91],"decoder":[96],"visualize":[98],"speaker's":[100],"face.":[101],"Subsequently,":[102],"uses":[104],"KNN-based":[106],"approach":[107],"map":[109],"features":[111],"style":[114,136],"The":[118],"experiments":[119],"demonstrate":[120],"generated":[124,152],"imagined":[127],"data":[129],"have":[130],"better":[131],"at":[133,159],"capturing":[134],"than":[137],"text-based":[138],"methods":[139],"same":[142],"descriptions.<sup":[143],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[144,146],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup><sup":[145],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>A":[147],"demo":[148],"website":[149],"featuring":[150],"face":[153],"utterances":[156],"available":[158],"https://sd-evg.github.io.":[160]},"counts_by_year":[],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
