{"id":"https://openalex.org/W4416789145","doi":"https://doi.org/10.1145/3769047.3769049","title":"PhonemeNet: A Transformer Pipeline for Text-Driven Facial Animation","display_name":"PhonemeNet: A Transformer Pipeline for Text-Driven Facial Animation","publication_year":2025,"publication_date":"2025-11-28","ids":{"openalex":"https://openalex.org/W4416789145","doi":"https://doi.org/10.1145/3769047.3769049"},"language":"en","primary_location":{"id":"doi:10.1145/3769047.3769049","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769047.3769049","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 18th ACM SIGGRAPH Conference on Motion, Interaction, and Games","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3769047.3769049","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021147268","display_name":"Philine Witzig","orcid":"https://orcid.org/0009-0004-9782-8118"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Philine Witzig","raw_affiliation_strings":["Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"],"raw_orcid":"https://orcid.org/0009-0004-9782-8118","affiliations":[{"raw_affiliation_string":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069878201","display_name":"Barbara Solenthaler","orcid":"https://orcid.org/0000-0001-7494-8660"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Barbara Solenthaler","raw_affiliation_strings":["Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"],"raw_orcid":"https://orcid.org/0000-0001-7494-8660","affiliations":[{"raw_affiliation_string":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033076979","display_name":"Markus Gro\u00df","orcid":"https://orcid.org/0009-0003-9324-779X"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Markus Gross","raw_affiliation_strings":["Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"],"raw_orcid":"https://orcid.org/0009-0003-9324-779X","affiliations":[{"raw_affiliation_string":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015553607","display_name":"Rafael Wampfler","orcid":"https://orcid.org/0000-0003-0158-1305"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Rafael Wampfler","raw_affiliation_strings":["Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"],"raw_orcid":"https://orcid.org/0000-0003-0158-1305","affiliations":[{"raw_affiliation_string":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland","institution_ids":["https://openalex.org/I35440088"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32928551,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.7476000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.7476000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.21330000460147858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.012199999764561653,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sync","display_name":"sync","score":0.6675999760627747},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.65420001745224},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.5636000037193298},{"id":"https://openalex.org/keywords/computer-facial-animation","display_name":"Computer facial animation","score":0.4487000107765198},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.4343999922275543},{"id":"https://openalex.org/keywords/computer-animation","display_name":"Computer animation","score":0.3752000033855438},{"id":"https://openalex.org/keywords/decodes","display_name":"Decodes","score":0.367000013589859},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.34860000014305115}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.73089998960495},{"id":"https://openalex.org/C3913047","wikidata":"https://www.wikidata.org/wiki/Q1956265","display_name":"sync","level":3,"score":0.6675999760627747},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.65420001745224},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.5636000037193298},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5630999803543091},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.4487000107765198},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.4343999922275543},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40880000591278076},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.3752000033855438},{"id":"https://openalex.org/C2778858076","wikidata":"https://www.wikidata.org/wiki/Q5249539","display_name":"Decodes","level":3,"score":0.367000013589859},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.34860000014305115},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.335999995470047},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.32679998874664307},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.31119999289512634},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.3082999885082245},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C130727458","wikidata":"https://www.wikidata.org/wiki/Q1639109","display_name":"Coarticulation","level":3,"score":0.29319998621940613},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.2538999915122986}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3769047.3769049","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769047.3769049","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 18th ACM SIGGRAPH Conference on Motion, Interaction, and Games","raw_type":"proceedings-article"},{"id":"pmh:doi:10.3929/ethz-c-000789227","is_oa":true,"landing_page_url":"http://hdl.handle.net/20.500.11850/789227","pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference Paper"}],"best_oa_location":{"id":"doi:10.1145/3769047.3769049","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769047.3769049","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 18th ACM SIGGRAPH Conference on Motion, Interaction, and Games","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2468212864","https://openalex.org/W2737658251","https://openalex.org/W2739192055","https://openalex.org/W2747874407","https://openalex.org/W2981263323","https://openalex.org/W3120163087","https://openalex.org/W3154411171","https://openalex.org/W3176721746","https://openalex.org/W4200630629","https://openalex.org/W4200633626","https://openalex.org/W4238101556","https://openalex.org/W4312309978","https://openalex.org/W4372341521","https://openalex.org/W4380994134","https://openalex.org/W4385822588","https://openalex.org/W4386076250","https://openalex.org/W4388157164","https://openalex.org/W4390872742","https://openalex.org/W4400397470","https://openalex.org/W4400582137","https://openalex.org/W4400818936","https://openalex.org/W4402660084","https://openalex.org/W4402727776","https://openalex.org/W4402753806","https://openalex.org/W4403788878","https://openalex.org/W4403897432","https://openalex.org/W4404519566","https://openalex.org/W4412587677","https://openalex.org/W4412588042"],"related_works":[],"abstract_inverted_index":{"We":[0,56,106,113],"present":[1],"a":[2,30,41,52,81,128],"fully":[3],"text-driven":[4],"framework":[5],"for":[6,14],"3D":[7],"facial":[8,158],"animation":[9],"that":[10,70,135],"eliminates":[11],"the":[12,46,65,119],"need":[13],"audio":[15],"input":[16],"or":[17],"explicit":[18],"prosodic":[19],"cues.":[20],"Our":[21,132],"architecture":[22],"extracts":[23],"rich":[24],"phoneme":[25,71,101],"embeddings":[26,39,72],"from":[27],"text":[28],"using":[29],"pre-trained":[31,53],"TTS":[32],"encoder,":[33],"aligns":[34],"them":[35],"with":[36],"quantized":[37],"motion":[38,144],"via":[40],"transformer":[42,54],"decoder,":[43],"and":[44,103,111,122,143,151],"decodes":[45],"result":[47],"into":[48],"mesh":[49],"deformations":[50],"through":[51],"decoder.":[55],"explore":[57],"two":[58],"scenarios":[59],"of":[60],"our":[61,108],"pipeline:":[62],"(1)":[63],"In":[64,80],"single-subject":[66,120],"setting,":[67,83],"we":[68,89],"find":[69],"alone":[73],"can":[74],"yield":[75],"accurate":[76,115],"lip":[77,116,141],"motion.":[78],"(2)":[79],"multi-subject":[82,130],"where":[84],"speaker":[85,104],"articulation":[86],"varies":[87],"widely,":[88],"introduce":[90],"stochastic":[91],"latent":[92],"modulation":[93],"to":[94,156],"model":[95],"residual":[96],"variability":[97],"conditioned":[98],"on":[99,127],"both":[100],"context":[102],"identity.":[105],"evaluate":[107],"approach":[109],"quantitatively":[110],"qualitatively:":[112],"demonstrate":[114],"sync":[117,142],"in":[118],"case,":[121],"compare":[123],"against":[124],"audio-driven":[125,157],"baselines":[126],"large":[129],"dataset.":[131],"results":[133],"show":[134],"PhonemeNet":[136],"not":[137],"only":[138],"achieves":[139],"competitive":[140],"quality,":[145],"but":[146],"also":[147],"offers":[148],"flexibility,":[149],"modularity,":[150],"scalability":[152],"as":[153],"an":[154],"alternative":[155],"animation.":[159]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-28T00:00:00"}
