{"id":"https://openalex.org/W4403791980","doi":"https://doi.org/10.1145/3664647.3680619","title":"ConsistentAvatar: Learning to Diffuse Fully Consistent Talking Head Avatar with Temporal Guidance","display_name":"ConsistentAvatar: Learning to Diffuse Fully Consistent Talking Head Avatar with Temporal Guidance","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403791980","doi":"https://doi.org/10.1145/3664647.3680619"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3680619","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3680619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072993313","display_name":"Haijie Yang","orcid":"https://orcid.org/0000-0003-3489-0438"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haijie Yang","raw_affiliation_strings":["Nanjing University of Science and Technology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100389499","display_name":"Zhenyu Zhang","orcid":"https://orcid.org/0000-0001-5727-9450"},"institutions":[{"id":"https://openalex.org/I3923682","display_name":"Soochow University","ror":"https://ror.org/05t8y2r12","country_code":"CN","type":"education","lineage":["https://openalex.org/I3923682"]},{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenyu Zhang","raw_affiliation_strings":["Nanjing University, Suzhou, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University, Suzhou, China","institution_ids":["https://openalex.org/I3923682","https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050748634","display_name":"Hao Tang","orcid":"https://orcid.org/0000-0002-2077-1246"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Tang","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064363522","display_name":"Jianjun Qian","orcid":"https://orcid.org/0000-0002-0968-8556"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianjun Qian","raw_affiliation_strings":["Nanjing University of Science and Techonology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Techonology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100726984","display_name":"Jian Yang","orcid":"https://orcid.org/0000-0003-4800-832X"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Yang","raw_affiliation_strings":["Nanjing University of Science and Technology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5072993313"],"corresponding_institution_ids":["https://openalex.org/I36399199"],"apc_list":null,"apc_paid":null,"fwci":0.7424,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.72946276,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3964","last_page":"3973"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/avatar","display_name":"Avatar","score":0.8992769718170166},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7157903909683228},{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.6554396152496338},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.49366840720176697},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40404948592185974},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.39010298252105713},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.08772090077400208}],"concepts":[{"id":"https://openalex.org/C2777365542","wikidata":"https://www.wikidata.org/wiki/Q83090","display_name":"Avatar","level":2,"score":0.8992769718170166},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7157903909683228},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.6554396152496338},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.49366840720176697},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40404948592185974},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39010298252105713},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.08772090077400208},{"id":"https://openalex.org/C114793014","wikidata":"https://www.wikidata.org/wiki/Q52109","display_name":"Geomorphology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3680619","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3680619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W2237250383","https://openalex.org/W2769666294","https://openalex.org/W3014859719","https://openalex.org/W3182714435","https://openalex.org/W3204680331","https://openalex.org/W3207250917","https://openalex.org/W4281779270","https://openalex.org/W4312233178","https://openalex.org/W4312364013","https://openalex.org/W4312578889","https://openalex.org/W4312607553","https://openalex.org/W4312918802","https://openalex.org/W4312933868","https://openalex.org/W4368755870","https://openalex.org/W4386066263","https://openalex.org/W4386072021","https://openalex.org/W4386075487","https://openalex.org/W4386075804","https://openalex.org/W4386076654","https://openalex.org/W4390872742","https://openalex.org/W4390873054","https://openalex.org/W4402727830","https://openalex.org/W4402753521"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Diffusion":[0],"models":[1],"have":[2],"shown":[3],"impressive":[4],"potential":[5],"on":[6,136,186,198],"talking":[7,14,54,166],"head":[8,141],"generation.":[9,56],"While":[10],"plausible":[11],"appearance":[12],"and":[13,33,52,92,143,204],"effect":[15],"are":[16],"achieved,":[17],"these":[18],"methods":[19,197],"still":[20],"suffer":[21],"from":[22],"temporal,":[23],"3D":[24],"or":[25],"expression":[26,203],"inconsistency":[27],"due":[28],"to":[29,63,70,109,116,162],"the":[30,64,73,98,113,119,137,150,155,159,173,179,184,195,199],"error":[31,181],"accumulation":[32],"inherent":[34],"limitation":[35],"of":[36,58,112,118,175],"single-image":[37],"generation":[38],"ability.":[39],"In":[40],"this":[41],"paper,":[42],"we":[43,82,107],"propose":[44,83],"ConsistentAvatar,":[45],"a":[46,84,102,130],"novel":[47],"framework":[48],"for":[49,76],"fully":[50,131],"consistent":[51,104,132],"high-fidelity":[53],"avatar":[55,126],"Instead":[57],"directly":[59],"employing":[60],"multi-modal":[61],"conditions":[62],"diffusion":[65,105,133,160],"process,":[66],"our":[67],"method":[68],"learns":[69],"first":[71],"model":[72],"temporal":[74,103,156,205],"representation":[75],"stability":[77],"between":[78],"adjacent":[79],"frames.":[80],"Specifically,":[81],"Temporally-Sensitive":[85],"Detail":[86],"(TSD)":[87],"map":[88],"containing":[89],"high-frequency":[90],"feature":[91],"contours":[93],"that":[94,117,149,192],"vary":[95],"significantly":[96],"along":[97],"time":[99],"axis.":[100],"Using":[101],"module,":[106,134],"learn":[108],"align":[110],"TSD":[111],"initial":[114],"result":[115],"video":[120],"frame":[121],"ground":[122],"truth.":[123],"The":[124],"final":[125],"is":[127],"generated":[128,200],"by":[129],"conditioned":[135],"aligned":[138,151],"TSD,":[139,152],"rough":[140],"normal,":[142],"emotion":[144],"prompt":[145],"embedding.":[146],"We":[147],"find":[148],"which":[153],"represents":[154],"patterns,":[157],"constrains":[158],"process":[161],"generate":[163],"temporally":[164],"stable":[165],"head.":[167],"Further,":[168],"its":[169],"reliable":[170],"guidance":[171],"complements":[172],"inaccuracy":[174],"other":[176],"conditions,":[177],"suppressing":[178],"accumulated":[180],"while":[182],"improving":[183],"consistency":[185],"various":[187],"aspects.":[188],"Extensive":[189],"experiments":[190],"demonstrate":[191],"ConsistentAvatar":[193],"outperforms":[194],"state-of-the-art":[196],"appearance,":[201],"3D,":[202],"consistency.":[206]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2025-10-10T00:00:00"}
