{"id":"https://openalex.org/W4406461501","doi":"https://doi.org/10.1109/slt61566.2024.10832181","title":"Laugh Now Cry Later: Controlling Time-Varying Emotional States of Flow-Matching-Based Zero-Shot Text-To-Speech","display_name":"Laugh Now Cry Later: Controlling Time-Varying Emotional States of Flow-Matching-Based Zero-Shot Text-To-Speech","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461501","doi":"https://doi.org/10.1109/slt61566.2024.10832181"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832181","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832181","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101474128","display_name":"Haibin Wu","orcid":"https://orcid.org/0000-0001-7166-5534"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Haibin Wu","raw_affiliation_strings":["National Taiwan University"],"affiliations":[{"raw_affiliation_string":"National Taiwan University","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100363489","display_name":"Xiaofei Wang","orcid":"https://orcid.org/0009-0004-6683-3969"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaofei Wang","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026088950","display_name":"\u015eefik Emre Eskimez","orcid":"https://orcid.org/0000-0001-6259-5925"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sefik Emre Eskimez","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028363114","display_name":"Manthan Thakker","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Manthan Thakker","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064232860","display_name":"Daniel M. Tompkins","orcid":"https://orcid.org/0000-0003-3209-8084"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel Tompkins","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101375070","display_name":"Chung-Hsien Tsai","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chung-Hsien Tsai","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112416737","display_name":"Canrun Li","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Canrun Li","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033383677","display_name":"Zhen Xiao","orcid":"https://orcid.org/0000-0003-3832-3916"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Xiao","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100329353","display_name":"Sheng Zhao","orcid":"https://orcid.org/0000-0002-9624-5381"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sheng Zhao","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365053","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-1089-9748"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016279564","display_name":"Naoyuki Kanda","orcid":"https://orcid.org/0000-0002-8628-3288"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Naoyuki Kanda","raw_affiliation_strings":["Microsoft Corporation,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5101474128"],"corresponding_institution_ids":["https://openalex.org/I16733864"],"apc_list":null,"apc_paid":null,"fwci":1.8185,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.88155881,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"690","last_page":"697"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.6711075305938721},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.6327124238014221},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5509756803512573},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5310792326927185},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5251559615135193},{"id":"https://openalex.org/keywords/flow","display_name":"Flow (mathematics)","score":0.4595867693424225},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.18963831663131714},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.17324283719062805},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.11181703209877014},{"id":"https://openalex.org/keywords/mechanics","display_name":"Mechanics","score":0.09915074706077576},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.06915247440338135},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.06798514723777771}],"concepts":[{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.6711075305938721},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.6327124238014221},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5509756803512573},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5310792326927185},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5251559615135193},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.4595867693424225},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.18963831663131714},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.17324283719062805},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.11181703209877014},{"id":"https://openalex.org/C57879066","wikidata":"https://www.wikidata.org/wiki/Q41217","display_name":"Mechanics","level":1,"score":0.09915074706077576},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.06915247440338135},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.06798514723777771},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C191897082","wikidata":"https://www.wikidata.org/wiki/Q11467","display_name":"Metallurgy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt61566.2024.10832181","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832181","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":52,"referenced_works":["https://openalex.org/W97072897","https://openalex.org/W1494198834","https://openalex.org/W2149628368","https://openalex.org/W2166637769","https://openalex.org/W2742542661","https://openalex.org/W2770743791","https://openalex.org/W2795781874","https://openalex.org/W2963755523","https://openalex.org/W2970006822","https://openalex.org/W2995181338","https://openalex.org/W3036601975","https://openalex.org/W3081192838","https://openalex.org/W3135644023","https://openalex.org/W3160329778","https://openalex.org/W3197934793","https://openalex.org/W3209984917","https://openalex.org/W4210777104","https://openalex.org/W4225302959","https://openalex.org/W4252812408","https://openalex.org/W4296069154","https://openalex.org/W4303647933","https://openalex.org/W4311000453","https://openalex.org/W4313316128","https://openalex.org/W4313679638","https://openalex.org/W4319985616","https://openalex.org/W4361994820","https://openalex.org/W4372262418","https://openalex.org/W4375869364","https://openalex.org/W4385245566","https://openalex.org/W4385822379","https://openalex.org/W4389600306","https://openalex.org/W4391454536","https://openalex.org/W4391801052","https://openalex.org/W4392538788","https://openalex.org/W4392904830","https://openalex.org/W4392908892","https://openalex.org/W4402111215","https://openalex.org/W4402112120","https://openalex.org/W4402115961","https://openalex.org/W4402669711","https://openalex.org/W6603931906","https://openalex.org/W6678809451","https://openalex.org/W6746238782","https://openalex.org/W6752307458","https://openalex.org/W6767111847","https://openalex.org/W6780218876","https://openalex.org/W6846539466","https://openalex.org/W6847363464","https://openalex.org/W6848735303","https://openalex.org/W6859908464","https://openalex.org/W6861856086","https://openalex.org/W6862144568"],"related_works":["https://openalex.org/W2074502265","https://openalex.org/W4214877189","https://openalex.org/W2773965352","https://openalex.org/W2381179799","https://openalex.org/W2980279061","https://openalex.org/W2334685461","https://openalex.org/W2366718574","https://openalex.org/W2359774528","https://openalex.org/W4298312966","https://openalex.org/W2325697621"],"abstract_inverted_index":{"People":[0],"change":[1],"their":[2],"tones":[3],"of":[4,87,104],"voice,":[5],"often":[6],"accompanied":[7],"by":[8],"nonverbal":[9],"vocalizations":[10],"(NVs)":[11],"such":[12],"as":[13,62,64],"laughter":[14,65],"and":[15,59,123],"cries,":[16],"to":[17,29,67],"convey":[18],"rich":[19,33],"emotions.":[20],"However,":[21],"most":[22],"text-to-speech":[23],"(TTS)":[24],"systems":[25],"lack":[26],"the":[27,69,102],"capability":[28],"generate":[30,47,124],"speech":[31,50,77],"with":[32,51],"emotions,":[34,122],"including":[35],"NVs.":[36],"This":[37],"paper":[38],"introduces":[39],"EmoCtrl-TTS,":[40],"an":[41],"emotion-controllable":[42],"zeroshot":[43],"TTS":[44],"that":[45,97,114],"can":[46,116],"highly":[48],"emotional":[49,76],"NVs":[52,126],"for":[53,132],"any":[54],"speaker.":[55],"EmoCtrl-TTS":[56,79,98,115],"leverages":[57],"arousal":[58],"valence":[60],"values,":[61],"well":[63],"embeddings,":[66],"condition":[68],"flow-matchingbased":[70],"zero-shot":[71,128],"TTS.":[72,129],"To":[73],"achieve":[74],"high-quality":[75],"generation,":[78],"is":[80],"trained":[81],"using":[82],"more":[83],"than":[84],"27,000":[85],"hours":[86],"expressive":[88],"data":[89],"curated":[90],"based":[91],"on":[92],"pseudo-labeling.":[93],"Comprehensive":[94],"evaluations":[95],"demonstrate":[96],"excels":[99],"in":[100,107,127],"mimicking":[101],"emotions":[103],"audio":[105],"prompts":[106],"speech-to-speech":[108],"translation":[109],"scenarios.":[110],"We":[111],"also":[112],"show":[113],"capture":[117],"emotion":[118],"changes,":[119],"express":[120],"strong":[121],"various":[125],"See":[130],"https://aka.ms/emoctrl-tts":[131],"demo":[133],"samples.":[134]},"counts_by_year":[{"year":2025,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
