{"id":"https://openalex.org/W4411631939","doi":"https://doi.org/10.1145/3731715.3733322","title":"EmoHuman: Fine-Grained Emotion-Controlled Talking Head Generation via Audio-Text Multimodal Detangling","display_name":"EmoHuman: Fine-Grained Emotion-Controlled Talking Head Generation via Audio-Text Multimodal Detangling","publication_year":2025,"publication_date":"2025-06-25","ids":{"openalex":"https://openalex.org/W4411631939","doi":"https://doi.org/10.1145/3731715.3733322"},"language":"en","primary_location":{"id":"doi:10.1145/3731715.3733322","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731715.3733322","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088926630","display_name":"Qifeng Dai","orcid":"https://orcid.org/0009-0002-2071-8506"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qifeng Dai","raw_affiliation_strings":["Xiamen University, Xiamen, Fujian, China"],"raw_orcid":"https://orcid.org/0009-0002-2071-8506","affiliations":[{"raw_affiliation_string":"Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110201434","display_name":"Hua Feng","orcid":"https://orcid.org/0009-0001-0889-0584"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huidong Feng","raw_affiliation_strings":["Xiamen University, Xiamen, Fujian, China"],"raw_orcid":"https://orcid.org/0009-0001-0889-0584","affiliations":[{"raw_affiliation_string":"Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wendi Cui","orcid":"https://orcid.org/0009-0008-0286-4634"},"institutions":[{"id":"https://openalex.org/I180662265","display_name":"China Mobile (China)","ror":"https://ror.org/05gftfe97","country_code":"CN","type":"company","lineage":["https://openalex.org/I180662265"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wendi Cui","raw_affiliation_strings":["China Mobile Communications Corporation, Xiamen, Fujian, China"],"raw_orcid":"https://orcid.org/0009-0008-0286-4634","affiliations":[{"raw_affiliation_string":"China Mobile Communications Corporation, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I180662265"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067816878","display_name":"Xinqi Cai","orcid":null},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinqi Cai","raw_affiliation_strings":["Xiamen University, Xiamen, Fujian, China"],"raw_orcid":"https://orcid.org/0009-0002-9607-1040","affiliations":[{"raw_affiliation_string":"Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011936767","display_name":"Yinglin Zheng","orcid":"https://orcid.org/0000-0003-4671-6111"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yinglin Zheng","raw_affiliation_strings":["Xiamen University, Xiamen, Fujian, China"],"raw_orcid":"https://orcid.org/0000-0003-4671-6111","affiliations":[{"raw_affiliation_string":"Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100447274","display_name":"Ming Zeng","orcid":"https://orcid.org/0000-0002-5056-0706"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Zeng","raw_affiliation_strings":["Xiamen University, Xiamen, Fujian, China"],"raw_orcid":"https://orcid.org/0000-0002-5056-0706","affiliations":[{"raw_affiliation_string":"Xiamen University, Xiamen, Fujian, China","institution_ids":["https://openalex.org/I191208505"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5088926630"],"corresponding_institution_ids":["https://openalex.org/I191208505"],"apc_list":null,"apc_paid":null,"fwci":1.8526,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.85234188,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"145","last_page":"154"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7628539204597473},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5517221093177795},{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.5036329627037048},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.43651413917541504},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.41230830550193787}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7628539204597473},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5517221093177795},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.5036329627037048},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.43651413917541504},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.41230830550193787},{"id":"https://openalex.org/C114793014","wikidata":"https://www.wikidata.org/wiki/Q52109","display_name":"Geomorphology","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3731715.3733322","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731715.3733322","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W2030931454","https://openalex.org/W2604379605","https://openalex.org/W2803193013","https://openalex.org/W2965813579","https://openalex.org/W2990138404","https://openalex.org/W3019952993","https://openalex.org/W3081492798","https://openalex.org/W3101631197","https://openalex.org/W3109114891","https://openalex.org/W3174763799","https://openalex.org/W3197199219","https://openalex.org/W3211147706","https://openalex.org/W4281730245","https://openalex.org/W4310379947","https://openalex.org/W4310695675","https://openalex.org/W4312722235","https://openalex.org/W4312933868","https://openalex.org/W4382240211","https://openalex.org/W4385490328","https://openalex.org/W4385822407","https://openalex.org/W4386072021","https://openalex.org/W4386075487","https://openalex.org/W4390872297","https://openalex.org/W4390872334","https://openalex.org/W4390872742","https://openalex.org/W4390872769","https://openalex.org/W4392902646","https://openalex.org/W4394597549","https://openalex.org/W4403081627","https://openalex.org/W4404199654","https://openalex.org/W6600540558","https://openalex.org/W6601158201","https://openalex.org/W6810793953","https://openalex.org/W6834284007"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Audio-driven":[0],"talking":[1,51],"head":[2,52],"generation":[3],"has":[4],"made":[5],"significant":[6],"strides":[7],"in":[8,116,129],"creating":[9],"realistic":[10,118],"and":[11,67,77,106,131,136],"lip-synchronized":[12],"portraits.":[13],"However,":[14],"most":[15],"existing":[16],"approaches":[17],"overlook":[18],"facial":[19,28,61,113],"expressions,":[20,114],"with":[21],"only":[22],"a":[23,95],"few":[24],"attempting":[25],"to":[26,33,93],"model":[27],"emotions":[29],"explicitly,":[30],"often":[31],"leading":[32],"unnatural":[34],"results.":[35],"To":[36],"address":[37],"this":[38],"gap,":[39],"we":[40],"introduce":[41],"EmoHuman,":[42],"an":[43,83],"audio-to-video":[44],"synthesis":[45],"method":[46],"that":[47,124],"generates":[48],"emotionally":[49],"nuanced":[50],"videos":[53],"without":[54],"relying":[55],"on":[56],"intermediate":[57],"3D":[58],"representations":[59],"or":[60],"landmarks.":[62],"EmoHuman":[63,125],"decouples":[64],"content,":[65],"emotion,":[66],"emotional":[68,107],"intensity":[69,108],"from":[70],"the":[71,75,78,110],"multimodal":[72],"information":[73],"of":[74,112],"audio":[76],"corresponding":[79],"textual":[80],"content":[81,89],"through":[82],"Audio":[84],"Emotion":[85],"Decoupling":[86],"Module.":[87],"The":[88],"features":[90],"are":[91],"used":[92],"drive":[94],"powerful":[96],"video":[97,119,132],"diffusion":[98],"model,":[99],"generating":[100],"synchronized":[101],"lip":[102],"movements,":[103],"while":[104],"emotion":[105],"govern":[109],"simulation":[111],"resulting":[115],"more":[117],"outputs.":[120],"Extensive":[121],"experiments":[122],"demonstrate":[123],"outperforms":[126],"state-of-the-art":[127],"methods":[128],"image":[130],"quality,":[133],"expression":[134],"correlation,":[135],"lip-synchronization":[137],"accuracy.":[138]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
