{"id":"https://openalex.org/W4366493008","doi":"https://doi.org/10.1109/taslp.2023.3268571","title":"iEmoTTS: Toward Robust Cross-Speaker Emotion Transfer and Control for Speech Synthesis Based on Disentanglement Between Prosody and Timbre","display_name":"iEmoTTS: Toward Robust Cross-Speaker Emotion Transfer and Control for Speech Synthesis Based on Disentanglement Between Prosody and Timbre","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4366493008","doi":"https://doi.org/10.1109/taslp.2023.3268571"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3268571","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3268571","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5019754670","display_name":"Guangyan Zhang","orcid":"https://orcid.org/0000-0002-8640-8933"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]},{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN","HK"],"is_corresponding":true,"raw_author_name":"Guangyan Zhang","raw_affiliation_strings":["Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","DSP & Speech Technology Laboratory, Department of Electronic Engineering, The Chinese University of Hong Kong, Hong Kong"],"affiliations":[{"raw_affiliation_string":"Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"DSP & Speech Technology Laboratory, Department of Electronic Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013250925","display_name":"Ying Qin","orcid":"https://orcid.org/0000-0003-4606-7174"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Qin","raw_affiliation_strings":["Institute of Information Science, Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Information Science, Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100385498","display_name":"Wenjie Zhang","orcid":"https://orcid.org/0000-0001-6572-2600"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjie Zhang","raw_affiliation_strings":["Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102841028","display_name":"Jialun Wu","orcid":"https://orcid.org/0000-0002-9015-7487"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jialun Wu","raw_affiliation_strings":["Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100745806","display_name":"Mei Li","orcid":"https://orcid.org/0000-0003-2962-8945"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mei Li","raw_affiliation_strings":["Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040253510","display_name":"Yutao Gai","orcid":"https://orcid.org/0009-0002-1464-5478"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yutao Gai","raw_affiliation_strings":["Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057405217","display_name":"Feijun Jiang","orcid":"https://orcid.org/0000-0001-5579-5144"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feijun Jiang","raw_affiliation_strings":["Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Intelligent Connectivity, Cloud &amp; Technology, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001795601","display_name":"Tan Lee","orcid":"https://orcid.org/0000-0002-7089-3436"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Tan Lee","raw_affiliation_strings":["DSP &amp; Speech Technology Laboratory, Department of Electronic Engineering, The Chinese University of Hong Kong, Hong Kong"],"affiliations":[{"raw_affiliation_string":"DSP &amp; Speech Technology Laboratory, Department of Electronic Engineering, The Chinese University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5019754670"],"corresponding_institution_ids":["https://openalex.org/I177725633","https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":5.1251,"has_fulltext":false,"cited_by_count":26,"citation_normalized_percentile":{"value":0.96436085,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":"31","issue":null,"first_page":"1693","last_page":"1705"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8096774816513062},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6863610148429871},{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.6134330630302429},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.547150731086731},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5452495217323303}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8096774816513062},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6863610148429871},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.6134330630302429},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.547150731086731},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5452495217323303},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3268571","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3268571","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8100071792","display_name":null,"funder_award_id":"2021RC244","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"}],"funders":[{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":65,"referenced_works":["https://openalex.org/W349236604","https://openalex.org/W1570629387","https://openalex.org/W1971670143","https://openalex.org/W2085013480","https://openalex.org/W2129142580","https://openalex.org/W2134973740","https://openalex.org/W2150658333","https://openalex.org/W2154611638","https://openalex.org/W2154920538","https://openalex.org/W2294130536","https://openalex.org/W2398561585","https://openalex.org/W2747664154","https://openalex.org/W2752796333","https://openalex.org/W2785364623","https://openalex.org/W2793479148","https://openalex.org/W2888976508","https://openalex.org/W2889326414","https://openalex.org/W2907262790","https://openalex.org/W2950151997","https://openalex.org/W2950689937","https://openalex.org/W2962788625","https://openalex.org/W2962793481","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2998115938","https://openalex.org/W3008691130","https://openalex.org/W3015841875","https://openalex.org/W3022876224","https://openalex.org/W3094785744","https://openalex.org/W3096457008","https://openalex.org/W3096830101","https://openalex.org/W3135547455","https://openalex.org/W3135644023","https://openalex.org/W3139170550","https://openalex.org/W3146550708","https://openalex.org/W3150572638","https://openalex.org/W3160329778","https://openalex.org/W3162791003","https://openalex.org/W3195366750","https://openalex.org/W3197304925","https://openalex.org/W3198712562","https://openalex.org/W3206725777","https://openalex.org/W3207961486","https://openalex.org/W4210433094","https://openalex.org/W4226421465","https://openalex.org/W4234095459","https://openalex.org/W4296069154","https://openalex.org/W4385245566","https://openalex.org/W6631190155","https://openalex.org/W6640215214","https://openalex.org/W6729448088","https://openalex.org/W6730091202","https://openalex.org/W6746238782","https://openalex.org/W6749555683","https://openalex.org/W6750489868","https://openalex.org/W6752888775","https://openalex.org/W6760861152","https://openalex.org/W6762533536","https://openalex.org/W6763832098","https://openalex.org/W6765987481","https://openalex.org/W6776390925","https://openalex.org/W6778823374","https://openalex.org/W6783867762","https://openalex.org/W6796464841","https://openalex.org/W6802591955"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2406877384","https://openalex.org/W2595839522","https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W2038083449","https://openalex.org/W2124576126","https://openalex.org/W258725851"],"abstract_inverted_index":{"The":[0,55,70,90,106,118,206],"capability":[1],"of":[2,9,59,76,87,139,162,183,199,211,219],"generating":[3,26],"speech":[4,28,30,164,182,230],"with":[5,32,195,214,231],"a":[6,22,47,63,67,252],"specific":[7],"type":[8,78],"emotion":[10,19,33,50,61,71,77,82,91,116,148,177,221,233,237],"is":[11,21,38,57,93,109,143,157,186,192,224,245],"desired":[12],"for":[13,41,115,124,169,173],"many":[14,128],"human-computer":[15],"interaction":[16],"applications.":[17],"Cross-speaker":[18],"transfer":[20,51,149,248],"common":[23],"approach":[24],"to":[25,111,145,247,251],"emotional":[27,249],"when":[29],"data":[31],"labels":[34],"from":[35,84],"target":[36,184],"speakers":[37,185],"not":[39,187],"available":[40],"model":[42,190],"training.":[43],"This":[44],"paper":[45],"presents":[46],"novel":[48],"cross-speaker":[49,147,220],"system":[52,56],"named":[53],"iEmoTTS.":[54,196],"composed":[58],"an":[60],"encoder,":[62],"prosody":[64,107,153],"predictor,":[65],"and":[66,79,136,154,166,235],"timbre":[68,119,167],"encoder.":[69],"encoder":[72,120],"extracts":[73],"the":[74,80,85,96,100,125,141,159,170,181,209],"identity":[75],"respective":[81],"intensity":[83,92],"mel-spectrogram":[86],"input":[88,101],"speech.":[89],"measured":[94],"by":[95],"posterior":[97],"probability":[98],"that":[99,104,180,226],"utterance":[102],"carries":[103],"emotion.":[105],"predictor":[108],"used":[110],"provide":[112],"prosodic":[113],"features":[114],"transfer.":[117,222],"provides":[121],"timbre-related":[122],"information":[123,241,250],"system.":[126],"Unlike":[127],"other":[129,215],"studies":[130],"which":[131],"focus":[132],"on":[133],"disentangling":[134],"speaker":[135,174,254],"style":[137],"factors":[138],"speech,":[140],"iEmoTTS":[142,212,227,244],"designed":[144],"achieve":[146],"via":[150],"disentanglement":[151],"between":[152],"timbre.":[155],"Prosody":[156],"considered":[158],"primary":[160],"carrier":[161],"emotion-related":[163],"characteristics,":[165],"accounts":[168],"essential":[171],"characteristics":[172],"identification.":[175],"Zero-shot":[176],"transfer,":[178],"meaning":[179],"seen":[188],"in":[189],"training,":[191],"also":[193],"realized":[194],"Extensive":[197],"experiments":[198],"subjective":[200],"evaluation":[201],"have":[202],"been":[203],"carried":[204],"out.":[205],"results":[207],"demonstrate":[208],"effectiveness":[210],"compared":[213],"recently":[216],"proposed":[217],"systems":[218],"It":[223],"shown":[225],"can":[228],"produce":[229],"designated":[232],"types":[234],"controllable":[236],"intensity.":[238],"With":[239],"appropriate":[240],"bottleneck":[242],"capacity,":[243],"able":[246],"new":[253],"effectively.":[255],"Audio":[256],"samples":[257],"are":[258],"publicly":[259],"available.":[260]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":6}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
