{"id":"https://openalex.org/W4392902784","doi":"https://doi.org/10.1109/icassp48485.2024.10446852","title":"Mels-Tts : Multi-Emotion Multi-Lingual Multi-Speaker Text-To-Speech System Via Disentangled Style Tokens","display_name":"Mels-Tts : Multi-Emotion Multi-Lingual Multi-Speaker Text-To-Speech System Via Disentangled Style Tokens","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392902784","doi":"https://doi.org/10.1109/icassp48485.2024.10446852"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446852","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446852","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081997471","display_name":"Heejin Choi","orcid":"https://orcid.org/0000-0001-5093-2859"},"institutions":[{"id":"https://openalex.org/I4210101778","display_name":"Samsung (United States)","ror":"https://ror.org/01bfbvm65","country_code":"US","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210101778"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Heejin Choi","raw_affiliation_strings":["Samsung Research"],"affiliations":[{"raw_affiliation_string":"Samsung Research","institution_ids":["https://openalex.org/I4210101778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101771882","display_name":"Jae\u2010Sung Bae","orcid":"https://orcid.org/0000-0003-4236-2926"},"institutions":[{"id":"https://openalex.org/I4210101778","display_name":"Samsung (United States)","ror":"https://ror.org/01bfbvm65","country_code":"US","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210101778"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jae-Sung Bae","raw_affiliation_strings":["Samsung Research"],"affiliations":[{"raw_affiliation_string":"Samsung Research","institution_ids":["https://openalex.org/I4210101778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042018167","display_name":"Joun Yeop Lee","orcid":"https://orcid.org/0000-0002-3316-4808"},"institutions":[{"id":"https://openalex.org/I2800817003","display_name":"Southern California University for Professional Studies","ror":"https://ror.org/058zz0t50","country_code":"US","type":"education","lineage":["https://openalex.org/I2800817003"]},{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]},{"id":"https://openalex.org/I4210101778","display_name":"Samsung (United States)","ror":"https://ror.org/01bfbvm65","country_code":"US","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210101778"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joun Yeop Lee","raw_affiliation_strings":["Samsung Research","University of Southern California"],"affiliations":[{"raw_affiliation_string":"Samsung Research","institution_ids":["https://openalex.org/I4210101778"]},{"raw_affiliation_string":"University of Southern California","institution_ids":["https://openalex.org/I2800817003","https://openalex.org/I1174212"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021831317","display_name":"Seongkyu Mun","orcid":null},"institutions":[{"id":"https://openalex.org/I4210101778","display_name":"Samsung (United States)","ror":"https://ror.org/01bfbvm65","country_code":"US","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210101778"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Seongkyu Mun","raw_affiliation_strings":["Samsung Research"],"affiliations":[{"raw_affiliation_string":"Samsung Research","institution_ids":["https://openalex.org/I4210101778"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100674127","display_name":"Jihwan Lee","orcid":"https://orcid.org/0000-0001-9380-5470"},"institutions":[{"id":"https://openalex.org/I2800817003","display_name":"Southern California University for Professional Studies","ror":"https://ror.org/058zz0t50","country_code":"US","type":"education","lineage":["https://openalex.org/I2800817003"]},{"id":"https://openalex.org/I4210101778","display_name":"Samsung (United States)","ror":"https://ror.org/01bfbvm65","country_code":"US","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210101778"]},{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jihwan Lee","raw_affiliation_strings":["Samsung Research","University of Southern California"],"affiliations":[{"raw_affiliation_string":"Samsung Research","institution_ids":["https://openalex.org/I4210101778"]},{"raw_affiliation_string":"University of Southern California","institution_ids":["https://openalex.org/I2800817003","https://openalex.org/I1174212"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052468556","display_name":"Hoon Young Cho","orcid":"https://orcid.org/0000-0002-6850-6580"},"institutions":[{"id":"https://openalex.org/I4210101778","display_name":"Samsung (United States)","ror":"https://ror.org/01bfbvm65","country_code":"US","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210101778"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hoon-Young Cho","raw_affiliation_strings":["Samsung Research"],"affiliations":[{"raw_affiliation_string":"Samsung Research","institution_ids":["https://openalex.org/I4210101778"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100684422","display_name":"Chanwoo Kim","orcid":"https://orcid.org/0000-0003-0193-8167"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chanwoo Kim","raw_affiliation_strings":["Korea University"],"affiliations":[{"raw_affiliation_string":"Korea University","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5081997471"],"corresponding_institution_ids":["https://openalex.org/I4210101778"],"apc_list":null,"apc_paid":null,"fwci":1.3264,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.82433671,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"12682","last_page":"12686"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7761199474334717},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7344382405281067},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7042262554168701},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.62906813621521},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5904743671417236},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5606275796890259},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.456582248210907},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4358018636703491},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4167618751525879},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3353821039199829}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7761199474334717},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7344382405281067},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7042262554168701},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.62906813621521},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5904743671417236},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5606275796890259},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.456582248210907},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4358018636703491},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4167618751525879},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3353821039199829},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446852","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446852","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7300000190734863,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2187089797","https://openalex.org/W2502312327","https://openalex.org/W2793479148","https://openalex.org/W2937870435","https://openalex.org/W2963470929","https://openalex.org/W2963964591","https://openalex.org/W2964243274","https://openalex.org/W2972473628","https://openalex.org/W2972921407","https://openalex.org/W3010916717","https://openalex.org/W3022876224","https://openalex.org/W3025075133","https://openalex.org/W3094650042","https://openalex.org/W3095389792","https://openalex.org/W3195366750","https://openalex.org/W4224861944","https://openalex.org/W4224931727","https://openalex.org/W4283832640","https://openalex.org/W4294311176","https://openalex.org/W4296068781","https://openalex.org/W4390075359","https://openalex.org/W6631190155","https://openalex.org/W6724804524","https://openalex.org/W6746238782","https://openalex.org/W6749555683","https://openalex.org/W6750489868","https://openalex.org/W6763832098","https://openalex.org/W6778823374","https://openalex.org/W6796464841","https://openalex.org/W6936113694"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1999004162","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2023466863","https://openalex.org/W2696990509"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"a":[3,113],"multi-emotion,":[4,41],"multi-lingual,":[5,42],"and":[6,30,43,63,98],"multi-speaker":[7,44],"text-to-speech":[8],"(MELS-TTS)":[9],"system,":[10],"employing":[11],"disentangled":[12],"style":[13,56,70,78],"tokens":[14,57,71],"for":[15,38],"effective":[16],"emotion":[17],"transfer.":[18],"In":[19],"speech":[20,83,107],"encompassing":[21],"various":[22],"attributes,":[23],"such":[24],"as":[25],"emotional":[26],"state,":[27],"speaker":[28],"identity,":[29],"linguistic":[31],"style,":[32],"disentangling":[33],"these":[34],"elements":[35],"is":[36],"crucial":[37],"an":[39],"efficient":[40],"TTS":[45],"system.":[46],"To":[47],"accomplish":[48],"this":[49],"purpose,":[50],"we":[51],"propose":[52],"to":[53,58,104],"utilize":[54],"separate":[55],"disentangle":[59],"emotion,":[60],"language,":[61],"speaker,":[62,116],"residual":[64],"information,":[65],"inspired":[66],"by":[67],"the":[68,74,86,102,119],"global":[69],"(GSTs).":[72],"Through":[73],"attention":[75],"mechanism,":[76],"each":[77],"token":[79],"learns":[80],"its":[81],"respective":[82],"attribute":[84],"from":[85,112],"target":[87],"speech.":[88],"Our":[89],"proposed":[90],"approach":[91],"yields":[92],"improved":[93],"performance":[94],"in":[95],"both":[96],"objective":[97],"subjective":[99],"evaluations,":[100],"demonstrating":[101],"ability":[103],"generate":[105],"cross-lingual":[106],"with":[108],"diverse":[109],"emotions,":[110],"even":[111],"neutral":[114],"source":[115],"while":[117],"preserving":[118],"speaker\u2019s":[120],"identity.":[121]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
