{"id":"https://openalex.org/W4414170463","doi":"https://doi.org/10.1109/avss65446.2025.11149961","title":"ATL-Diff: Audio-Driven Talking Head Generation with Early Landmarks-Guide Noise Diffusion","display_name":"ATL-Diff: Audio-Driven Talking Head Generation with Early Landmarks-Guide Noise Diffusion","publication_year":2025,"publication_date":"2025-08-11","ids":{"openalex":"https://openalex.org/W4414170463","doi":"https://doi.org/10.1109/avss65446.2025.11149961"},"language":"en","primary_location":{"id":"doi:10.1109/avss65446.2025.11149961","is_oa":false,"landing_page_url":"https://doi.org/10.1109/avss65446.2025.11149961","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Advanced Visual and Signal-Based Systems (AVSS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016273080","display_name":"Son Vo","orcid":null},"institutions":[{"id":"https://openalex.org/I111277659","display_name":"Chonnam National University","ror":"https://ror.org/05kzjxq56","country_code":"KR","type":"education","lineage":["https://openalex.org/I111277659"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Son Thanh-Hoang Vo","raw_affiliation_strings":["Chonnam National University,Gwangju,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Chonnam National University,Gwangju,Republic of Korea","institution_ids":["https://openalex.org/I111277659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101533886","display_name":"Nguy\u1ec5n Quang V\u1ecbnh","orcid":"https://orcid.org/0000-0003-2584-1466"},"institutions":[{"id":"https://openalex.org/I111277659","display_name":"Chonnam National University","ror":"https://ror.org/05kzjxq56","country_code":"KR","type":"education","lineage":["https://openalex.org/I111277659"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Quang-Vinh Nguyen","raw_affiliation_strings":["Chonnam National University,Gwangju,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Chonnam National University,Gwangju,Republic of Korea","institution_ids":["https://openalex.org/I111277659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108699829","display_name":"Seungwon Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I111277659","display_name":"Chonnam National University","ror":"https://ror.org/05kzjxq56","country_code":"KR","type":"education","lineage":["https://openalex.org/I111277659"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Seungwon Kim","raw_affiliation_strings":["Chonnam National University,Gwangju,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Chonnam National University,Gwangju,Republic of Korea","institution_ids":["https://openalex.org/I111277659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087619194","display_name":"Hyung-Jeong Yang","orcid":"https://orcid.org/0000-0003-3024-5060"},"institutions":[{"id":"https://openalex.org/I111277659","display_name":"Chonnam National University","ror":"https://ror.org/05kzjxq56","country_code":"KR","type":"education","lineage":["https://openalex.org/I111277659"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hyung-Jeong Yang","raw_affiliation_strings":["Chonnam National University,Gwangju,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Chonnam National University,Gwangju,Republic of Korea","institution_ids":["https://openalex.org/I111277659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057089774","display_name":"Soonja Yeom","orcid":"https://orcid.org/0000-0002-5843-101X"},"institutions":[{"id":"https://openalex.org/I129801699","display_name":"University of Tasmania","ror":"https://ror.org/01nfmeh72","country_code":"AU","type":"education","lineage":["https://openalex.org/I129801699"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Soonja Yeom","raw_affiliation_strings":["University of Tasmania,Hobart,Australia"],"affiliations":[{"raw_affiliation_string":"University of Tasmania,Hobart,Australia","institution_ids":["https://openalex.org/I129801699"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100605822","display_name":"Soo-Hyung Kim","orcid":"https://orcid.org/0000-0003-3575-5035"},"institutions":[{"id":"https://openalex.org/I111277659","display_name":"Chonnam National University","ror":"https://ror.org/05kzjxq56","country_code":"KR","type":"education","lineage":["https://openalex.org/I111277659"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Soo-Hyung Kim","raw_affiliation_strings":["Chonnam National University,Gwangju,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Chonnam National University,Gwangju,Republic of Korea","institution_ids":["https://openalex.org/I111277659"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5016273080"],"corresponding_institution_ids":["https://openalex.org/I111277659"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32115677,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.6179999709129333},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.536899983882904},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5041000247001648},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.4503999948501587},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.3889999985694885},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.34850001335144043},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.3337000012397766},{"id":"https://openalex.org/keywords/noise-shaping","display_name":"Noise shaping","score":0.2985999882221222}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.796500027179718},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.6179999709129333},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5425999760627747},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.536899983882904},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5041000247001648},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.4503999948501587},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.41659998893737793},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40689998865127563},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.3889999985694885},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.34850001335144043},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.3337000012397766},{"id":"https://openalex.org/C9083635","wikidata":"https://www.wikidata.org/wiki/Q2133535","display_name":"Noise shaping","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.29440000653266907},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.2921000123023987},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C114996537","wikidata":"https://www.wikidata.org/wiki/Q4854529","display_name":"Colors of noise","level":3,"score":0.2808000147342682},{"id":"https://openalex.org/C191287063","wikidata":"https://www.wikidata.org/wiki/Q543281","display_name":"Glitch","level":3,"score":0.2799000144004822},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.27070000767707825},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.257999986410141},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2522999942302704},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.25110000371932983}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/avss65446.2025.11149961","is_oa":false,"landing_page_url":"https://doi.org/10.1109/avss65446.2025.11149961","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Advanced Visual and Signal-Based Systems (AVSS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2030931454","https://openalex.org/W2949662773","https://openalex.org/W2963081548","https://openalex.org/W3019952993","https://openalex.org/W3036601975","https://openalex.org/W3097777922","https://openalex.org/W3099284785","https://openalex.org/W3104792420","https://openalex.org/W3109114891","https://openalex.org/W3121370741","https://openalex.org/W3174763799","https://openalex.org/W4200295417","https://openalex.org/W4312497550","https://openalex.org/W4385245566","https://openalex.org/W4385682917","https://openalex.org/W4386598361","https://openalex.org/W4389049828","https://openalex.org/W4394597549","https://openalex.org/W4396821501","https://openalex.org/W4403791312"],"related_works":[],"abstract_inverted_index":{"Audio-driven":[0],"talking":[1],"head":[2],"generation":[3],"requires":[4],"precise":[5],"synchronization":[6,21],"between":[7],"facial":[8,42,96],"animations":[9],"and":[10,26,57,69,92,109],"audio":[11,40,50],"signals.":[12],"This":[13,98],"paper":[14],"introduces":[15],"ATL-Diff,":[16],"a":[17,35,44,58],"novel":[18],"approach":[19,47,82],"addressing":[20],"limitations":[22],"while":[23],"reducing":[24],"noise":[25,53],"computational":[27,90],"costs.":[28],"Our":[29,81],"framework":[30],"features":[31],"three":[32],"key":[33],"components:":[34],"Landmark":[36],"Generation":[37],"Module":[38],"converting":[39],"to":[41,55],"landmarks,":[43,56],"Landmarks-Guide":[45],"Noise":[46],"that":[48,73],"decouples":[49],"by":[51],"distributing":[52],"according":[54],"3D":[59],"Identity":[60],"Diffusion":[61],"network":[62],"preserving":[63],"identity":[64],"characteristics.":[65],"Experiments":[66],"on":[67],"MEAD":[68],"CREMA-D":[70],"datasets":[71],"demonstrate":[72],"ATL-Diff":[74],"outperforms":[75],"state-of-the-art":[76],"methods":[77],"across":[78],"all":[79],"metrics.":[80],"achieves":[83],"near":[84],"real-time":[85],"processing":[86],"with":[87],"high-quality":[88],"animations,":[89],"efficiency,":[91],"exceptional":[93],"preservation":[94],"of":[95],"nuances.":[97],"advancement":[99],"offers":[100],"promising":[101],"applications":[102],"for":[103],"virtual":[104],"assistants,":[105],"education,":[106],"medical":[107],"communication,":[108],"digital":[110],"platforms.":[111],"The":[112],"source":[113],"code":[114],"is":[115],"available":[116],"at:":[117],"https://github.com/sonvth/ATL-Diff":[118]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
