{"id":"https://openalex.org/W4402980089","doi":"https://doi.org/10.1109/icme57554.2024.10687417","title":"Voice-to-Face Generation: Couple of Self-Supervised Representation Learning with Diffusion Model","display_name":"Voice-to-Face Generation: Couple of Self-Supervised Representation Learning with Diffusion Model","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4402980089","doi":"https://doi.org/10.1109/icme57554.2024.10687417"},"language":"en","primary_location":{"id":"doi:10.1109/icme57554.2024.10687417","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687417","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013562664","display_name":"Wuyang Chen","orcid":"https://orcid.org/0000-0002-7746-4191"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wuyang Chen","raw_affiliation_strings":["National University of Defense Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013340793","display_name":"Kele Xu","orcid":"https://orcid.org/0000-0001-5997-5169"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kele Xu","raw_affiliation_strings":["National University of Defense Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051680867","display_name":"Yong Dou","orcid":"https://orcid.org/0000-0002-1256-8934"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yong Dou","raw_affiliation_strings":["National University of Defense Technology,Changsha,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,Changsha,China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5011666324","display_name":"Tian Gao","orcid":"https://orcid.org/0000-0002-1646-5430"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian Gao","raw_affiliation_strings":["iFlytek Research,Hefei,China"],"affiliations":[{"raw_affiliation_string":"iFlytek Research,Hefei,China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5013562664"],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":0.3626,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.6727748,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.498199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.498199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14222","display_name":"Knowledge Management and Technology","score":0.4291999936103821,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7101911306381226},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.6436530947685242},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6407912969589233},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5419569611549377},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4587779939174652},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.45229873061180115},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.05822613835334778}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7101911306381226},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6436530947685242},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6407912969589233},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5419569611549377},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4587779939174652},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.45229873061180115},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.05822613835334778},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme57554.2024.10687417","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687417","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1781854261","https://openalex.org/W2057179155","https://openalex.org/W2066078528","https://openalex.org/W2077881910","https://openalex.org/W2138719476","https://openalex.org/W2139749422","https://openalex.org/W2166563626","https://openalex.org/W2726515241","https://openalex.org/W2897663715","https://openalex.org/W2962785568","https://openalex.org/W2962918445","https://openalex.org/W2963801643","https://openalex.org/W2979157532","https://openalex.org/W3154848313","https://openalex.org/W3162846947","https://openalex.org/W3175300676","https://openalex.org/W3176445421","https://openalex.org/W3177326298","https://openalex.org/W3215180973","https://openalex.org/W4225760288","https://openalex.org/W4284898017","https://openalex.org/W4285603046","https://openalex.org/W4312282373","https://openalex.org/W4312372834","https://openalex.org/W4312722235","https://openalex.org/W4312933868","https://openalex.org/W4372341093","https://openalex.org/W4385822589","https://openalex.org/W4386071707","https://openalex.org/W4386076291","https://openalex.org/W4386076428","https://openalex.org/W4390871829","https://openalex.org/W4390872674","https://openalex.org/W4390874283"],"related_works":["https://openalex.org/W2062195135","https://openalex.org/W2795079307","https://openalex.org/W2793058541","https://openalex.org/W1983629434","https://openalex.org/W2055929693","https://openalex.org/W4324271173","https://openalex.org/W1967645776","https://openalex.org/W2352227742","https://openalex.org/W4390679071","https://openalex.org/W3006966347"],"abstract_inverted_index":{"In":[0],"this":[1],"study,":[2],"we":[3,54,91],"explore":[4],"the":[5,23,47,58,63,73,85,107],"challenging":[6],"task":[7],"of":[8,49],"generating":[9],"facial":[10],"images":[11,135],"from":[12],"unheard":[13],"voices,":[14],"aiming":[15],"to":[16,22,39,112],"synthesize":[17],"similar":[18],"faces":[19],"that":[20,31,53,118],"correspond":[21],"voice":[24,59,68,78,88,101],"identity.":[25],"We":[26],"design":[27],"a":[28,94],"novel":[29],"framework":[30,120],"encompasses":[32],"voice-face":[33,127],"self-supervised":[34],"representation":[35],"learning":[36],"and":[37,89,131],"extends":[38],"voice-based":[40],"face":[41,81],"generation.":[42],"The":[43,99],"key":[44],"idea":[45],"behind":[46],"feasibility":[48],"cross-modal":[50,74],"generation":[51],"is":[52],"not":[55],"only":[56],"enrich":[57],"representations":[60,102],"by":[61],"modeling":[62],"locally":[64],"inherent":[65],"correlations":[66],"within":[67],"data":[69],"but":[70],"also":[71],"establish":[72],"connections":[75],"through":[76],"aligning":[77],"with":[79],"paired":[80],"data.":[82],"To":[83],"enhance":[84],"association":[86,128],"between":[87],"face,":[90],"further":[92],"promote":[93],"false":[95],"negative":[96],"mitigation":[97],"method.":[98],"learned":[100],"are":[103],"then":[104],"fed":[105],"into":[106],"diffusion":[108],"model":[109],"using":[110],"cross-attention":[111],"produce":[113],"an":[114],"image.":[115],"Experiments":[116],"show":[117],"our":[119],"outperforms":[121],"previous":[122],"state-of-the-art":[123],"methods":[124],"on":[125],"various":[126],"evaluation":[129],"tasks":[130],"yields":[132],"substantially":[133],"better":[134],"than":[136],"prior":[137],"approaches.":[138]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
