{"id":"https://openalex.org/W4414360141","doi":"https://doi.org/10.24963/ijcai.2025/173","title":"GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent Diffusion Transformer","display_name":"GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent Diffusion Transformer","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360141","doi":"https://doi.org/10.24963/ijcai.2025/173"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/173","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/173","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100929569","display_name":"Yihong Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yihong Lin","raw_affiliation_strings":["South China University of Technology"],"affiliations":[{"raw_affiliation_string":"South China University of Technology","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021141988","display_name":"Zhaoxin Fan","orcid":"https://orcid.org/0000-0002-6324-1712"},"institutions":[{"id":"https://openalex.org/I105126617","display_name":"Zhejiang International Studies University","ror":"https://ror.org/01vwvvq12","country_code":"CN","type":"education","lineage":["https://openalex.org/I105126617"]},{"id":"https://openalex.org/I4210165198","display_name":"Beijing Advanced Sciences and Innovation Center","ror":"https://ror.org/05qm21180","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165198"]},{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaoxin Fan","raw_affiliation_strings":["Beijing Advanced Innovation Center for Future Blockchain and Privacy Computing, School of Artificial Intelligence, Beihang University","Hangzhou International Innovation Institute, Beihang University"],"affiliations":[{"raw_affiliation_string":"Beijing Advanced Innovation Center for Future Blockchain and Privacy Computing, School of Artificial Intelligence, Beihang University","institution_ids":["https://openalex.org/I4210165198"]},{"raw_affiliation_string":"Hangzhou International Innovation Institute, Beihang University","institution_ids":["https://openalex.org/I105126617","https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114236922","display_name":"Xianjia Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xianjia Wu","raw_affiliation_strings":["Huawei Cloud"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113392257","display_name":"Lingyu Xiong","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingyu Xiong","raw_affiliation_strings":["South China University of Technology"],"affiliations":[{"raw_affiliation_string":"South China University of Technology","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111346914","display_name":"Xiandong Li","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiandong Li","raw_affiliation_strings":["Huawei Cloud","Nanjing University"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud","institution_ids":[]},{"raw_affiliation_string":"Nanjing University","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089840086","display_name":"Wenxiong Kang","orcid":"https://orcid.org/0000-0001-9023-7252"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxiong Kang","raw_affiliation_strings":["South China University of Technology"],"affiliations":[{"raw_affiliation_string":"South China University of Technology","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Liang Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang Peng","raw_affiliation_strings":["Huawei Cloud"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109785527","display_name":"Songju Lei","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songju Lei","raw_affiliation_strings":["Nanjing University"],"affiliations":[{"raw_affiliation_string":"Nanjing University","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100928264","display_name":"Xu Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang Xu","raw_affiliation_strings":["Huawei Cloud"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100929569"],"corresponding_institution_ids":["https://openalex.org/I90610280"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.24106736,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1548","last_page":"1556"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9811999797821045,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9603999853134155,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.5587000250816345},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4860999882221222},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.41749998927116394},{"id":"https://openalex.org/keywords/computer-facial-animation","display_name":"Computer facial animation","score":0.39329999685287476},{"id":"https://openalex.org/keywords/active-appearance-model","display_name":"Active appearance model","score":0.3873000144958496},{"id":"https://openalex.org/keywords/virtual-reality","display_name":"Virtual reality","score":0.3765999972820282},{"id":"https://openalex.org/keywords/augmented-reality","display_name":"Augmented reality","score":0.37540000677108765},{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.37540000677108765}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7379999756813049},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6381000280380249},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.5587000250816345},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5325000286102295},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4860999882221222},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.41749998927116394},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.39329999685287476},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.3873000144958496},{"id":"https://openalex.org/C194969405","wikidata":"https://www.wikidata.org/wiki/Q170519","display_name":"Virtual reality","level":2,"score":0.3765999972820282},{"id":"https://openalex.org/C153715457","wikidata":"https://www.wikidata.org/wiki/Q254183","display_name":"Augmented reality","level":2,"score":0.37540000677108765},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.37540000677108765},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34700000286102295},{"id":"https://openalex.org/C90697248","wikidata":"https://www.wikidata.org/wiki/Q1062896","display_name":"Character animation","level":4,"score":0.3447999954223633},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.3393999934196472},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.3156000077724457},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.296099990606308},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/173","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/173","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speech-driven":[0],"talking":[1],"head":[2],"generation":[3],"is":[4],"a":[5,57,66,79,86],"critical":[6],"yet":[7],"challenging":[8],"task":[9],"with":[10],"applications":[11],"in":[12,136],"augmented":[13],"reality":[14],"and":[15,24,41,48,140],"virtual":[16],"human":[17],"modeling.":[18],"While":[19],"recent":[20],"approaches":[21],"using":[22],"autoregressive":[23],"diffusion-based":[25],"models":[26],"have":[27],"achieved":[28],"notable":[29],"progress,":[30],"they":[31],"often":[32],"suffer":[33],"from":[34],"modality":[35,73],"inconsistencies,":[36],"particularly":[37],"misalignment":[38,74],"between":[39],"audio":[40],"mesh,":[42],"leading":[43],"to":[44,114],"reduced":[45],"motion":[46,107,141],"diversity":[47],"lip-sync":[49,97,138],"accuracy.":[50],"To":[51],"address":[52],"this,":[53],"we":[54],"propose":[55],"GLDiTalker,":[56],"novel":[58],"speech-driven":[59],"3D":[60,119],"facial":[61,120],"animation":[62],"model":[63],"based":[64],"on":[65,124],"Graph":[67],"Latent":[68,103],"Diffusion":[69,104],"Transformer.":[70],"GLDiTalker":[71,113,129],"resolves":[72],"by":[75],"diffusing":[76],"signals":[77],"within":[78],"quantized":[80],"spatiotemporal":[81],"latent":[82],"space.":[83],"It":[84],"employs":[85],"two-stage":[87],"training":[88],"pipeline:":[89],"the":[90,100],"Graph-Enhanced":[91],"Quantized":[92],"Space":[93],"Learning":[94],"Stage":[95,105],"ensures":[96],"accuracy,":[98],"while":[99],"Space-Time":[101],"Powered":[102],"enhances":[106],"diversity.":[108,142],"Together,":[109],"these":[110],"stages":[111],"enable":[112],"generate":[115],"realistic,":[116],"temporally":[117],"stable":[118],"animations.":[121],"Extensive":[122],"evaluations":[123],"standard":[125],"benchmarks":[126],"demonstrate":[127],"that":[128],"outperforms":[130],"existing":[131],"methods,":[132],"achieving":[133],"superior":[134],"results":[135],"both":[137],"accuracy":[139]},"counts_by_year":[],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2025-10-10T00:00:00"}
