{"id":"https://openalex.org/W4408352256","doi":"https://doi.org/10.1109/icassp49660.2025.10887982","title":"Diffused Poses and Distilled Expressions for Controllable Audio-driven Talking Face Generation","display_name":"Diffused Poses and Distilled Expressions for Controllable Audio-driven Talking Face Generation","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352256","doi":"https://doi.org/10.1109/icassp49660.2025.10887982"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10887982","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10887982","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089387872","display_name":"Ziqi Zhou","orcid":"https://orcid.org/0000-0003-0427-7819"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziqi Zhou","raw_affiliation_strings":["CAS University of Chinese Academy of Sciences,MAIS, Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CAS University of Chinese Academy of Sciences,MAIS, Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045242540","display_name":"Weize Quan","orcid":"https://orcid.org/0000-0003-0892-581X"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weize Quan","raw_affiliation_strings":["CAS University of Chinese Academy of Sciences,MAIS, Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CAS University of Chinese Academy of Sciences,MAIS, Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100388707","display_name":"Zuhong Lu","orcid":"https://orcid.org/0000-0003-3332-2615"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaojin Lu","raw_affiliation_strings":["Tellhow Group Co., LTD,Jiangxi Tellhow Animation College,Nanchang,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tellhow Group Co., LTD,Jiangxi Tellhow Animation College,Nanchang,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003969300","display_name":"Dong\u2010Ming Yan","orcid":"https://orcid.org/0000-0003-2209-2404"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dong-Ming Yan","raw_affiliation_strings":["CAS University of Chinese Academy of Sciences,MAIS, Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CAS University of Chinese Academy of Sciences,MAIS, Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02900611,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9807000160217285,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9323999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.6901695132255554},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6386207342147827},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5349145531654358},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3233809471130371}],"concepts":[{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6901695132255554},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6386207342147827},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5349145531654358},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3233809471130371},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10887982","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10887982","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/5","score":0.44999998807907104,"display_name":"Gender equality"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2133665775","https://openalex.org/W2593414223","https://openalex.org/W2726515241","https://openalex.org/W2963073614","https://openalex.org/W2964449965","https://openalex.org/W2964559396","https://openalex.org/W3019952993","https://openalex.org/W3081492798","https://openalex.org/W3180794345","https://openalex.org/W3186090335","https://openalex.org/W3187364420","https://openalex.org/W3197199219","https://openalex.org/W4382469130","https://openalex.org/W4386072021","https://openalex.org/W4386075576","https://openalex.org/W4388460040","https://openalex.org/W4390874567","https://openalex.org/W4404199654","https://openalex.org/W6637373629","https://openalex.org/W6737896281","https://openalex.org/W6765657114","https://openalex.org/W6765779288","https://openalex.org/W6859319753","https://openalex.org/W6864518445"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Audio-driven":[0],"portrait":[1,113],"animation":[2,114,158],"is":[3],"an":[4,134,156],"emerging":[5],"field":[6],"in":[7,99,187,199],"multi-modal":[8],"generation":[9,121],"that":[10,115,160,179],"aims":[11],"to":[12,78,167],"create":[13],"lifelike":[14],"talking":[15,112,173],"face":[16],"videos":[17],"from":[18],"audio":[19,32,62],"input.":[20],"While":[21],"significant":[22],"progress":[23],"has":[24],"been":[25],"made,":[26],"accurately":[27],"modeling":[28],"the":[29,58,94,200],"relationship":[30],"between":[31,61],"signals":[33],"and":[34,42,63,70,119,125,143,165,171,190],"various":[35],"facial":[36,65,87,126,151,192],"motions,":[37,88],"such":[38],"as":[39],"head":[40,68,123,141,174],"poses":[41,124,142,164],"expressions,":[43],"remains":[44],"a":[45,107,144],"challenge.":[46],"Existing":[47],"methods":[48],"have":[49,76],"primarily":[50],"focused":[51],"on":[52],"generating":[53,139,188],"lip-synchronized":[54],"movements,":[55],"often":[56,91],"neglecting":[57],"intricate":[59],"correlations":[60],"other":[64],"dynamics":[66],"like":[67],"movements":[69],"eye":[71],"blinks.":[72],"More":[73],"recent":[74],"approaches":[75],"attempted":[77],"address":[79],"these":[80,162],"limitations":[81],"by":[82],"introducing":[83],"latent":[84],"disentanglement":[85],"of":[86,96,122],"though":[89],"this":[90,103],"comes":[92],"at":[93],"cost":[95],"reduced":[97],"flexibility":[98],"motion":[100],"control.":[101],"In":[102],"work,":[104],"we":[105],"propose":[106],"novel":[108],"framework":[109],"for":[110,117,138,148],"audio-driven":[111],"allows":[116],"precise":[118],"controllable":[120,172],"expressions.":[127,152],"Our":[128],"approach":[129],"includes":[130],"two":[131],"key":[132],"components:":[133],"audio-conditional":[135],"diffusion":[136],"model":[137,159],"prosody-aware":[140],"noise-conditional,":[145],"lip-distilling":[146],"transformer":[147],"predicting":[149],"synchronized":[150,191],"We":[153],"further":[154],"introduce":[155],"innovative":[157],"uses":[161],"generated":[163],"expressions":[166],"produce":[168],"highly":[169],"realistic":[170],"videos.":[175],"Extensive":[176],"experiments":[177],"demonstrate":[178],"our":[180],"method":[181],"not":[182],"only":[183],"achieves":[184],"superior":[185],"performance":[186],"natural":[189],"motions":[193],"but":[194],"also":[195],"outperforms":[196],"state-of-the-art":[197],"techniques":[198],"field.":[201]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
