{"id":"https://openalex.org/W7137938543","doi":"https://doi.org/10.1609/aaai.v40i14.38162","title":"StreamingTalker: Audio-driven 3D Facial Animation with Autoregressive Diffusion Model","display_name":"StreamingTalker: Audio-driven 3D Facial Animation with Autoregressive Diffusion Model","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137938543","doi":"https://doi.org/10.1609/aaai.v40i14.38162"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i14.38162","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38162","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i14.38162","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129710858","display_name":"Yifan Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210156066","display_name":"State Key Laboratory of Chemical Engineering","ror":"https://ror.org/05803vc71","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210156066"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yifan Yang","raw_affiliation_strings":["State Key Laboratory of CAD&CG, Zhejiang University\nAnt Group"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of CAD&CG, Zhejiang University\nAnt Group","institution_ids":["https://openalex.org/I4210156066"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120699694","display_name":"Zhi Cen","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhi Cen","raw_affiliation_strings":["State Key Laboratory of CAD&CG, Zhejiang University"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of CAD&CG, Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073087748","display_name":"Sida Peng","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sida Peng","raw_affiliation_strings":["State Key Laboratory of CAD&CG, Zhejiang University"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of CAD&CG, Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067006037","display_name":"Xiangwei Chen","orcid":"https://orcid.org/0000-0002-6824-7486"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangwei Chen","raw_affiliation_strings":["College of Computer Science, Zhejiang University"],"affiliations":[{"raw_affiliation_string":"College of Computer Science, Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129731203","display_name":"Yifu Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yifu Deng","raw_affiliation_strings":["Ant Group"],"affiliations":[{"raw_affiliation_string":"Ant Group","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129662334","display_name":"Xinyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xinyu Zhu","raw_affiliation_strings":["Ant Group"],"affiliations":[{"raw_affiliation_string":"Ant Group","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129732715","display_name":"Fan Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan Jia","raw_affiliation_strings":["Ant Group"],"affiliations":[{"raw_affiliation_string":"Ant Group","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127782873","display_name":"Xiaowei Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaowei Zhou","raw_affiliation_strings":["State Key Laboratory of CAD&CG, Zhejiang University"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of CAD&CG, Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129744454","display_name":"Hujun Bao","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hujun Bao","raw_affiliation_strings":["State Key Laboratory of CAD&CG, Zhejiang University"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of CAD&CG, Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5129710858"],"corresponding_institution_ids":["https://openalex.org/I4210156066"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18731343,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"14","first_page":"11766","last_page":"11774"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9233999848365784,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9233999848365784,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.051500000059604645,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.0035000001080334187,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.6654999852180481},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.5281999707221985},{"id":"https://openalex.org/keywords/computer-facial-animation","display_name":"Computer facial animation","score":0.5273000001907349},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4837000072002411},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.41920000314712524},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.41670000553131104},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.4075999855995178},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.3968000113964081}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8030999898910522},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.6654999852180481},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5785999894142151},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.5281999707221985},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.5273000001907349},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5081999897956848},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4837000072002411},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.41920000314712524},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.41670000553131104},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.4075999855995178},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3968000113964081},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.37630000710487366},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3702999949455261},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3203999996185303},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.31700000166893005},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.31679999828338623},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.31310001015663147},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.30219998955726624},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29089999198913574},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.28049999475479126},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C98907195","wikidata":"https://www.wikidata.org/wiki/Q5428562","display_name":"Facial motion capture","level":5,"score":0.26409998536109924}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i14.38162","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38162","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i14.38162","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38162","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0,105,146],"paper":[1],"focuses":[2],"on":[3,167],"the":[4,48,72,138],"task":[5],"of":[6,118,127],"speech-driven":[7],"3D":[8,32],"facial":[9,18,33,99,156],"animation,":[10,34],"which":[11,56],"aims":[12],"to":[13,63,141,153],"generate":[14,155],"realistic":[15],"and":[16,41,75,113,134],"synchronized":[17],"motions":[19,100],"driven":[20],"by":[21],"speech":[22],"inputs.":[23,85],"Recent":[24],"methods":[25,46],"have":[26],"employed":[27],"audio-conditioned":[28],"diffusion":[29,95,151],"models":[30],"for":[31],"achieving":[35],"impressive":[36],"results":[37],"in":[38,52,101],"generating":[39],"expressive":[40],"natural":[42],"animations.":[43],"However,":[44],"these":[45,88],"process":[47],"whole":[49],"audio":[50,68,84,111,119,139],"sequences":[51,69],"a":[53,92,102,124,143,149],"single":[54],"pass,":[55],"poses":[57],"two":[58],"major":[59],"challenges:":[60],"they":[61],"tend":[62],"perform":[64],"poorly":[65],"when":[66,81],"handling":[67],"that":[70,97,171],"exceed":[71],"training":[73],"horizon":[74],"will":[76],"suffer":[77],"from":[78],"significant":[79],"latency":[80,116],"processing":[82],"long":[83],"To":[86],"address":[87],"limitations,":[89],"we":[90,122],"propose":[91],"novel":[93],"autoregressive":[94],"model":[96],"outputs":[98],"streaming":[103],"manner.":[104],"design":[106],"ensures":[107],"flexibility":[108],"with":[109,137,162],"varying":[110],"lengths":[112],"achieves":[114],"low":[115],"independent":[117],"duration.":[120],"Specifically,":[121],"select":[123],"limited":[125],"number":[126],"past":[128],"frames":[129],"as":[130],"historical":[131],"motion":[132,157],"context":[133],"combine":[135],"them":[136],"input":[140],"create":[142],"dynamic":[144],"condition.":[145],"condition":[147],"guides":[148],"lightweight":[150],"head":[152],"iteratively":[154],"frames,":[158],"enabling":[159],"real-time":[160],"synthesis":[161],"high-quality":[163],"results.":[164],"Experiments":[165],"conducted":[166],"public":[168],"datasets":[169],"demonstrate":[170],"our":[172],"approach":[173],"outperforms":[174],"recent":[175],"baseline":[176],"methods.":[177]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
