{"id":"https://openalex.org/W3207281888","doi":"https://doi.org/10.1145/3462244.3479883","title":"Audiovisual Speech Synthesis using Tacotron2","display_name":"Audiovisual Speech Synthesis using Tacotron2","publication_year":2021,"publication_date":"2021-10-15","ids":{"openalex":"https://openalex.org/W3207281888","doi":"https://doi.org/10.1145/3462244.3479883","mag":"3207281888"},"language":"en","primary_location":{"id":"doi:10.1145/3462244.3479883","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3462244.3479883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimodal Interaction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101807751","display_name":"Ahmed Hussen Abdelaziz","orcid":"https://orcid.org/0000-0001-8027-4666"},"institutions":[{"id":"https://openalex.org/I4210153776","display_name":"Apple (United States)","ror":"https://ror.org/059hsda18","country_code":"US","type":"company","lineage":["https://openalex.org/I4210153776"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ahmed Hussen Abdelaziz","raw_affiliation_strings":["Apple, USA"],"affiliations":[{"raw_affiliation_string":"Apple, USA","institution_ids":["https://openalex.org/I4210153776"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061966785","display_name":"Anushree Prasanna Kumar","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153776","display_name":"Apple (United States)","ror":"https://ror.org/059hsda18","country_code":"US","type":"company","lineage":["https://openalex.org/I4210153776"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anushree Prasanna Kumar","raw_affiliation_strings":["Apple, USA"],"affiliations":[{"raw_affiliation_string":"Apple, USA","institution_ids":["https://openalex.org/I4210153776"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070806541","display_name":"Chloe Seivwright","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Chloe Seivwright","raw_affiliation_strings":["Apple, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Apple, United Kingdom","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069340484","display_name":"Gabriele Fanelli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gabriele Fanelli","raw_affiliation_strings":["Apple, Switzerland"],"affiliations":[{"raw_affiliation_string":"Apple, Switzerland","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103014977","display_name":"Justin Binder","orcid":"https://orcid.org/0000-0003-0294-6601"},"institutions":[{"id":"https://openalex.org/I4210153776","display_name":"Apple (United States)","ror":"https://ror.org/059hsda18","country_code":"US","type":"company","lineage":["https://openalex.org/I4210153776"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Justin Binder","raw_affiliation_strings":["Apple, USA"],"affiliations":[{"raw_affiliation_string":"Apple, USA","institution_ids":["https://openalex.org/I4210153776"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035745788","display_name":"Yannis Stylianou","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yannis Stylianou","raw_affiliation_strings":["Apple, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Apple, United Kingdom","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091124471","display_name":"Sachin Kajareker","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153776","display_name":"Apple (United States)","ror":"https://ror.org/059hsda18","country_code":"US","type":"company","lineage":["https://openalex.org/I4210153776"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sachin Kajareker","raw_affiliation_strings":["Apple, USA"],"affiliations":[{"raw_affiliation_string":"Apple, USA","institution_ids":["https://openalex.org/I4210153776"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101807751"],"corresponding_institution_ids":["https://openalex.org/I4210153776"],"apc_list":null,"apc_paid":null,"fwci":1.3713,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.81703854,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"503","last_page":"511"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7758268117904663},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.7363684177398682},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7342405319213867},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.7025011777877808},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.567557692527771},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.5221496224403381},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.49851465225219727},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.4779008626937866},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4591926634311676},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.4566951394081116},{"id":"https://openalex.org/keywords/computer-facial-animation","display_name":"Computer facial animation","score":0.4352445602416992},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.37456077337265015},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33174675703048706},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.2881326675415039},{"id":"https://openalex.org/keywords/computer-animation","display_name":"Computer animation","score":0.22938618063926697}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7758268117904663},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.7363684177398682},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7342405319213867},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.7025011777877808},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.567557692527771},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.5221496224403381},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.49851465225219727},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.4779008626937866},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4591926634311676},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.4566951394081116},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.4352445602416992},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.37456077337265015},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33174675703048706},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.2881326675415039},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.22938618063926697},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3462244.3479883","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3462244.3479883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimodal Interaction","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.46000000834465027,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W71477782","https://openalex.org/W132898664","https://openalex.org/W1496852504","https://openalex.org/W1569907127","https://openalex.org/W1992187125","https://openalex.org/W2000911139","https://openalex.org/W2015394094","https://openalex.org/W2039063781","https://openalex.org/W2058970032","https://openalex.org/W2062712751","https://openalex.org/W2069997605","https://openalex.org/W2070726616","https://openalex.org/W2077521262","https://openalex.org/W2106409449","https://openalex.org/W2106715340","https://openalex.org/W2107226887","https://openalex.org/W2114336453","https://openalex.org/W2120067677","https://openalex.org/W2152826865","https://openalex.org/W2187089797","https://openalex.org/W2289286917","https://openalex.org/W2295661697","https://openalex.org/W2471370490","https://openalex.org/W2515372520","https://openalex.org/W2721682741","https://openalex.org/W2737658251","https://openalex.org/W2739192055","https://openalex.org/W2804619907","https://openalex.org/W2953022181","https://openalex.org/W2963767233","https://openalex.org/W2964243274","https://openalex.org/W2972498864","https://openalex.org/W2995238198","https://openalex.org/W3017923487","https://openalex.org/W3094612121"],"related_works":["https://openalex.org/W1927421023","https://openalex.org/W10581632","https://openalex.org/W3149582125","https://openalex.org/W1984347656","https://openalex.org/W1965141925","https://openalex.org/W2465421051","https://openalex.org/W652196294","https://openalex.org/W2368700418","https://openalex.org/W2587342322","https://openalex.org/W2540115864"],"abstract_inverted_index":{"Audiovisual":[0],"speech":[1,31,71,74,102,114,171],"synthesis":[2],"involves":[3],"synthesizing":[4],"a":[5,40,45,55,65,93,97],"talking":[6,90],"face":[7,56,124],"while":[8],"maximizing":[9],"the":[10,13,35,51,70,77,85,89,108,120,123,137,147,160,181,186],"coherency":[11],"of":[12,42,47,54,88,122,177],"acoustic":[14,48,60,101,113],"and":[15,50,76,139],"visual":[16],"speech.":[17,154],"To":[18],"solve":[19],"this":[20],"problem,":[21],"we":[22,95],"propose":[23],"using":[24,107,126],"AVTacotron2,":[25],"which":[26,179],"is":[27,103,115,163,180],"an":[28,127],"end-to-end":[29,138,161],"text-to-audiovisual":[30],"synthesizer":[32],"based":[33],"on":[34,142,185],"Tacotron2":[36],"architecture.":[37],"AVTacotron2":[38],"converts":[39],"sequence":[41,46],"phonemes":[43],"into":[44],"features":[49,61],"corresponding":[52,86],"controllers":[53,80],"model.":[57],"The":[58,73,111],"output":[59,78],"are":[62,81],"passed":[63],"through":[64],"WaveRNN":[66],"model":[67,125],"to":[68,83,118,150,165,168],"reconstruct":[69],"waveform.":[72],"waveform":[75],"facial":[79],"used":[82,117],"generate":[84,151],"video":[87],"face.":[91],"As":[92],"baseline,":[94],"use":[96],"modular":[98,140],"system,":[99],"where":[100],"synthesized":[104],"from":[105,190],"text":[106],"traditional":[109],"Tacotron2.":[110],"reconstructed":[112],"then":[116],"drive":[119],"controls":[121],"independently":[128],"trained":[129],"audio-to-facial-animation":[130],"neural":[131],"network.":[132],"We":[133],"further":[134],"condition":[135],"both":[136],"approaches":[141],"emotion":[143],"embeddings":[144],"that":[145,159],"encode":[146],"required":[148],"prosody":[149],"emotional":[152],"audiovisual":[153,170],"A":[155],"comprehensive":[156],"analysis":[157],"shows":[158],"system":[162],"able":[164],"synthesize":[166],"close":[167],"human-like":[169],"with":[172],"mean":[173],"opinion":[174],"scores":[175],"(MOS)":[176],"4.1,":[178],"same":[182],"MOS":[183],"obtained":[184],"ground":[187],"truth":[188],"generated":[189],"professionally":[191],"recorded":[192],"videos.":[193]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
