{"id":"https://openalex.org/W7119055293","doi":"https://doi.org/10.48550/arxiv.2601.01847","title":"ESGaussianFace: Emotional and Stylized Audio-Driven Facial Animation via 3D Gaussian Splatting","display_name":"ESGaussianFace: Emotional and Stylized Audio-Driven Facial Animation via 3D Gaussian Splatting","publication_year":2026,"publication_date":"2026-01-05","ids":{"openalex":"https://openalex.org/W7119055293","doi":"https://doi.org/10.48550/arxiv.2601.01847"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.01847","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01847","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.01847","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121840579","display_name":"Chuhang Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ma, Chuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122065042","display_name":"Shuai Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Shuai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122173947","display_name":"Ye Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Ye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041462553","display_name":"Jiaolong Yang","orcid":"https://orcid.org/0009-0006-1876-8976"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jiaolong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5122084294","display_name":"Xin Tong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tong, Xin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5121840579"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.7912999987602234,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.7912999987602234,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.16269999742507935,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.01140000019222498,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stylized-fact","display_name":"Stylized fact","score":0.8066999912261963},{"id":"https://openalex.org/keywords/facial-expression","display_name":"Facial expression","score":0.6388000249862671},{"id":"https://openalex.org/keywords/computer-facial-animation","display_name":"Computer facial animation","score":0.5687999725341797},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.5044999718666077},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4702000021934509},{"id":"https://openalex.org/keywords/landmark","display_name":"Landmark","score":0.420199990272522},{"id":"https://openalex.org/keywords/gaussian","display_name":"Gaussian","score":0.41679999232292175},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.4163999855518341}],"concepts":[{"id":"https://openalex.org/C38935604","wikidata":"https://www.wikidata.org/wiki/Q4330363","display_name":"Stylized fact","level":2,"score":0.8066999912261963},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7840999960899353},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6421999931335449},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.6388000249862671},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.5687999725341797},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.5044999718666077},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4819999933242798},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4702000021934509},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.420199990272522},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.41679999232292175},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.361299991607666},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.35429999232292175},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.34880000352859497},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.31700000166893005},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.31189998984336853},{"id":"https://openalex.org/C143110190","wikidata":"https://www.wikidata.org/wiki/Q5373787","display_name":"Emotional expression","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.30880001187324524},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29820001125335693},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.27900001406669617},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C98907195","wikidata":"https://www.wikidata.org/wiki/Q5428562","display_name":"Facial motion capture","level":5,"score":0.2567000091075897},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.01847","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01847","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.01847","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01847","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Most":[0],"current":[1],"audio-driven":[2,59],"facial":[3,22,60,107],"animation":[4],"research":[5],"primarily":[6],"focuses":[7],"on":[8],"generating":[9,29],"videos":[10,23,33],"with":[11,94],"neutral":[12],"emotions.":[13],"While":[14],"some":[15],"studies":[16],"have":[17],"addressed":[18],"the":[19,101,122,146,150],"generation":[20,77],"of":[21,78,121,149,183],"driven":[24],"by":[25],"emotional":[26,37,56,111,117,154],"audio,":[27],"efficiently":[28],"high-quality":[30],"talking":[31],"head":[32],"that":[34,89,174],"integrate":[35],"both":[36],"expressions":[38],"and":[39,57,72,118,128,156,167,189],"style":[40,129,157,190],"features":[41,93],"remains":[42],"a":[43,141],"significant":[44],"challenge.":[45],"In":[46],"this":[47],"paper,":[48],"we":[49,131,139],"propose":[50,83,140],"ESGaussianFace,":[51],"an":[52,84],"innovative":[53],"framework":[54],"for":[55],"stylized":[58,119],"animation.":[61],"Our":[62,159],"approach":[63],"leverages":[64],"3D":[65,70,79,123,134,168],"Gaussian":[66,124,135],"Splatting":[67],"to":[68,105],"reconstruct":[69,106],"scenes":[71],"render":[73],"videos,":[74],"ensuring":[75],"efficient":[76],"consistent":[80],"results.":[81],"We":[82],"emotion-audio-guided":[85],"spatial":[86],"attention":[87],"method":[88,176],"effectively":[90],"integrates":[91],"emotion":[92,127],"audio":[95],"content":[96],"features.":[97,158],"Through":[98],"emotion-guided":[99],"attention,":[100],"model":[102],"is":[103],"able":[104],"details":[108],"across":[109],"different":[110],"states":[112],"more":[113],"accurately.":[114],"To":[115],"achieve":[116],"deformations":[120],"points":[125],"through":[126],"features,":[130],"introduce":[132],"two":[133],"deformation":[136],"predictors.":[137],"Futhermore,":[138],"multi-stage":[142],"training":[143],"strategy,":[144],"enabling":[145],"step-by-step":[147],"learning":[148],"character's":[151],"lip":[152,184],"movements,":[153],"variations,":[155],"generated":[160],"results":[161,172],"exhibit":[162],"high":[163,165],"efficiency,":[164],"quality,":[166],"consistency.":[169],"Extensive":[170],"experimental":[171],"demonstrate":[173],"our":[175],"outperforms":[177],"existing":[178],"state-of-the-art":[179],"techniques":[180],"in":[181],"terms":[182],"movement":[185],"accuracy,":[186],"expression":[187],"variation,":[188],"feature":[191],"expressiveness.":[192]},"counts_by_year":[],"updated_date":"2026-01-08T20:10:11.968330","created_date":"2026-01-08T00:00:00"}
