{"id":"https://openalex.org/W7138277466","doi":"https://doi.org/10.1609/aaai.v40i12.37940","title":"READ: Real-time and Efficient Asynchronous Diffusion for Audio-driven Talking Head Generation","display_name":"READ: Real-time and Efficient Asynchronous Diffusion for Audio-driven Talking Head Generation","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138277466","doi":"https://doi.org/10.1609/aaai.v40i12.37940"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i12.37940","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i12.37940","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37940/41902","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37940/41902","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129727032","display_name":"Haotian Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haotian Wang","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013739157","display_name":"Yuzhe Weng","orcid":"https://orcid.org/0000-0002-1197-2513"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuzhe Weng","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129691252","display_name":"Jun Du","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Du","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129662015","display_name":"Haoran Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haoran Xu","raw_affiliation_strings":["iFLYTEK"],"affiliations":[{"raw_affiliation_string":"iFLYTEK","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129731140","display_name":"Xiaoyan Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaoyan Wu","raw_affiliation_strings":["iFLYTEK"],"affiliations":[{"raw_affiliation_string":"iFLYTEK","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015082268","display_name":"Shan He","orcid":"https://orcid.org/0000-0002-7982-9625"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan He","raw_affiliation_strings":["iFLYTEK"],"affiliations":[{"raw_affiliation_string":"iFLYTEK","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129719849","display_name":"Bing Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bing Yin","raw_affiliation_strings":["iFLYTEK"],"affiliations":[{"raw_affiliation_string":"iFLYTEK","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129730894","display_name":"Cong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cong Liu","raw_affiliation_strings":["iFLYTEK"],"affiliations":[{"raw_affiliation_string":"iFLYTEK","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129669846","display_name":"Jianqing Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jianqing Gao","raw_affiliation_strings":["iFLYTEK"],"affiliations":[{"raw_affiliation_string":"iFLYTEK","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129733293","display_name":"Qingfeng Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingfeng Liu","raw_affiliation_strings":["University of Science and Technology of China\niFLYTEK"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China\niFLYTEK","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5129727032"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.62910448,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"12","first_page":"9766","last_page":"9774"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9010000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9010000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.05389999970793724,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.006300000008195639,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.6758999824523926},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5472999811172485},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5087000131607056},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.3476000130176544},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.3472000062465668},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.33660000562667847}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8054999709129333},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.6758999824523926},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5472999811172485},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5087000131607056},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.41999998688697815},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39489999413490295},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.3476000130176544},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.33660000562667847},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.32179999351501465},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.30720001459121704},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.29280000925064087},{"id":"https://openalex.org/C19275194","wikidata":"https://www.wikidata.org/wiki/Q222903","display_name":"Multiplexing","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2881999909877777},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2700999975204468},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2621000111103058}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i12.37940","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i12.37940","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37940/41902","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i12.37940","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i12.37940","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37940/41902","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138277466.pdf","grobid_xml":"https://content.openalex.org/works/W7138277466.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,149],"introduction":[1],"of":[2,12,28,146],"diffusion":[3],"models":[4],"has":[5],"brought":[6],"significant":[7],"advances":[8],"to":[9,67,87,95,121],"the":[10,18,25,64,96,141,159],"field":[11],"audio-driven":[13],"talking":[14,30,43,117,179],"head":[15,31,44,118,180],"generation.":[16,69,201],"However,":[17],"extremely":[19],"slow":[20],"inference":[21,127,144],"speed":[22,193],"severely":[23],"limits":[24],"practical":[26],"implementation":[27],"diffusion-based":[29],"generation":[32,45,157],"models.":[33],"In":[34],"this":[35,76],"study,":[36],"we":[37,131],"propose":[38,132],"READ,":[39],"a":[40,51,59,80,107,133],"real-time":[41],"diffusion-transformer-based":[42],"framework.":[46,148],"Our":[47],"approach":[48],"first":[49],"learns":[50],"spatiotemporal":[52],"highly":[53],"compressed":[54,77,90],"video":[55,97,166],"latent":[56,78,92,98,101,160],"space":[57],"via":[58],"temporal":[60,123],"VAE,":[61],"significantly":[62,183],"reducing":[63],"token":[65],"count":[66],"accelerate":[68],"To":[70],"achieve":[71],"better":[72],"audio-visual":[73],"alignment":[74],"within":[75],"space,":[79,161],"pre-trained":[81],"Speech":[82],"Autoencoder":[83],"(SpeechAE)":[84],"is":[85],"proposed":[86],"generate":[88],"temporally":[89],"speech":[91],"codes":[93],"corresponding":[94],"space.":[99],"These":[100],"representations":[102],"are":[103],"then":[104],"modeled":[105],"by":[106,176],"carefully":[108],"designed":[109],"Audio-to-Video":[110],"Diffusion":[111],"Transformer":[112],"(A2V-DiT)":[113],"backbone":[114],"for":[115,139],"efficient":[116],"synthesis.":[119],"Furthermore,":[120],"ensure":[122],"consistency":[124,163],"and":[125,143,154,192],"accelerated":[126],"in":[128,158,164,199],"extended":[129],"generation,":[130],"novel":[134],"asynchronous":[135,152,155],"noise":[136],"scheduler":[137],"(ANS)":[138],"both":[140],"training":[142],"processes":[145],"our":[147],"ANS":[150],"leverages":[151],"add-noise":[153],"motion-guided":[156],"ensuring":[162],"generated":[165],"clips.":[167],"Experimental":[168],"results":[169],"demonstrate":[170],"that":[171],"READ":[172],"outperforms":[173],"state-of-the-art":[174],"methods":[175],"generating":[177],"competitive":[178],"videos":[181],"with":[182],"reduced":[184],"runtime,":[185],"achieving":[186],"an":[187],"optimal":[188],"balance":[189],"between":[190],"quality":[191],"while":[194],"maintaining":[195],"robust":[196],"metric":[197],"stability":[198],"long-time":[200]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
