{"id":"https://openalex.org/W4408354728","doi":"https://doi.org/10.1109/icassp49660.2025.10888737","title":"Emo-DPO: Controllable Emotional Speech Synthesis through Direct Preference Optimization","display_name":"Emo-DPO: Controllable Emotional Speech Synthesis through Direct Preference Optimization","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408354728","doi":"https://doi.org/10.1109/icassp49660.2025.10888737"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888737","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888737","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101856962","display_name":"Xiaoxue Gao","orcid":"https://orcid.org/0000-0003-1920-5228"},"institutions":[{"id":"https://openalex.org/I3005327000","display_name":"Institute for Infocomm Research","ror":"https://ror.org/053rfa017","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I3005327000","https://openalex.org/I91275662"]},{"id":"https://openalex.org/I115228651","display_name":"Agency for Science, Technology and Research","ror":"https://ror.org/036wvzt09","country_code":"SG","type":"government","lineage":["https://openalex.org/I115228651"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Xiaoxue Gao","raw_affiliation_strings":["Agency for Science, Technology, and Research (A*STAR),Institute for Infocomm Research (I2R),Singapore"],"affiliations":[{"raw_affiliation_string":"Agency for Science, Technology, and Research (A*STAR),Institute for Infocomm Research (I2R),Singapore","institution_ids":["https://openalex.org/I3005327000","https://openalex.org/I115228651"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100374116","display_name":"Chen Zhang","orcid":"https://orcid.org/0000-0002-4120-0151"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Chen Zhang","raw_affiliation_strings":["National University of Singapore,Department of Electrical and Computer Engineering,Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Department of Electrical and Computer Engineering,Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047957154","display_name":"Yiming Chen","orcid":"https://orcid.org/0009-0006-9016-6646"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yiming Chen","raw_affiliation_strings":["National University of Singapore,Department of Electrical and Computer Engineering,Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Department of Electrical and Computer Engineering,Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102856166","display_name":"Huayun Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I115228651","display_name":"Agency for Science, Technology and Research","ror":"https://ror.org/036wvzt09","country_code":"SG","type":"government","lineage":["https://openalex.org/I115228651"]},{"id":"https://openalex.org/I3005327000","display_name":"Institute for Infocomm Research","ror":"https://ror.org/053rfa017","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I3005327000","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Huayun Zhang","raw_affiliation_strings":["Agency for Science, Technology, and Research (A*STAR),Institute for Infocomm Research (I2R),Singapore"],"affiliations":[{"raw_affiliation_string":"Agency for Science, Technology, and Research (A*STAR),Institute for Infocomm Research (I2R),Singapore","institution_ids":["https://openalex.org/I3005327000","https://openalex.org/I115228651"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014190404","display_name":"Nancy F. Chen","orcid":"https://orcid.org/0000-0003-0872-5877"},"institutions":[{"id":"https://openalex.org/I115228651","display_name":"Agency for Science, Technology and Research","ror":"https://ror.org/036wvzt09","country_code":"SG","type":"government","lineage":["https://openalex.org/I115228651"]},{"id":"https://openalex.org/I3005327000","display_name":"Institute for Infocomm Research","ror":"https://ror.org/053rfa017","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I3005327000","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Nancy F. Chen","raw_affiliation_strings":["Agency for Science, Technology, and Research (A*STAR),Institute for Infocomm Research (I2R),Singapore"],"affiliations":[{"raw_affiliation_string":"Agency for Science, Technology, and Research (A*STAR),Institute for Infocomm Research (I2R),Singapore","institution_ids":["https://openalex.org/I3005327000","https://openalex.org/I115228651"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101856962"],"corresponding_institution_ids":["https://openalex.org/I115228651","https://openalex.org/I3005327000"],"apc_list":null,"apc_paid":null,"fwci":7.8715,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.97351429,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9799000024795532,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9143999814987183,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.6740844249725342},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5960768461227417},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5357889533042908},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5058669447898865},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1672939956188202}],"concepts":[{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.6740844249725342},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5960768461227417},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5357889533042908},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5058669447898865},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1672939956188202},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888737","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888737","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W2039800941","https://openalex.org/W2056716515","https://openalex.org/W3010943931","https://openalex.org/W3015841875","https://openalex.org/W3026010363","https://openalex.org/W3135644023","https://openalex.org/W3146550708","https://openalex.org/W3160329778","https://openalex.org/W3198123658","https://openalex.org/W4205742757","https://openalex.org/W4313155892","https://openalex.org/W4313316128","https://openalex.org/W4319985616","https://openalex.org/W4375869045","https://openalex.org/W4375869364","https://openalex.org/W4382202703","https://openalex.org/W4385993826","https://openalex.org/W4389600306","https://openalex.org/W4390041933","https://openalex.org/W4393160807","https://openalex.org/W4398152753","https://openalex.org/W4401607149","https://openalex.org/W4402669711","https://openalex.org/W4402753790","https://openalex.org/W4406461501","https://openalex.org/W4406794290","https://openalex.org/W6746238782","https://openalex.org/W6750489868","https://openalex.org/W6810738896","https://openalex.org/W6848735303","https://openalex.org/W6852418670","https://openalex.org/W6861148439","https://openalex.org/W6864712218","https://openalex.org/W6864829915","https://openalex.org/W6866461573","https://openalex.org/W6868904479","https://openalex.org/W6870225445","https://openalex.org/W6873826363"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Current":[0],"emotional":[1,20,36,69,81,93],"text-to-speech":[2],"(TTS)":[3],"models":[4,31],"pre-dominantly":[5],"conduct":[6],"supervised":[7],"training":[8],"to":[9,18,66,104],"learn":[10,33],"the":[11,34,50,99,120],"conversion":[12],"from":[13],"text":[14],"and":[15,109],"desired":[16],"emotion":[17,26,42],"its":[19],"speech,":[21],"focusing":[22],"on":[23,86],"a":[24,57],"single":[25],"per":[27],"text-speech":[28],"pair.":[29],"These":[30],"only":[32],"correct":[35],"outputs":[37],"without":[38],"fully":[39],"comprehending":[40],"other":[41],"characteristics,":[43],"which":[44,61],"limits":[45],"their":[46],"capabilities":[47],"of":[48,84],"capturing":[49],"nuances":[51,70],"between":[52,71],"different":[53],"emotions.":[54],"We":[55],"propose":[56,97],"controllable":[58],"Emo-DPO":[59],"approach,":[60],"employs":[62],"direct":[63],"preference":[64],"optimization":[65],"differentiate":[67],"subtle":[68],"emotions":[72,77],"through":[73],"optimizing":[74],"towards":[75],"preferred":[76,80],"over":[78],"less":[79],"ones.":[82],"Instead":[83],"relying":[85],"traditional":[87],"neural":[88,102],"architectures":[89],"used":[90],"in":[91],"existing":[92,121],"TTS":[94],"models,":[95],"we":[96],"utilizing":[98],"emotion-aware":[100],"LLM-TTS":[101],"architecture":[103],"leverage":[105],"LLMs\u2019":[106],"in-context":[107],"learning":[108],"instruction-following":[110],"capabilities.":[111],"Comprehensive":[112],"experiments":[113],"confirm":[114],"that":[115],"our":[116],"proposed":[117],"method":[118],"outperforms":[119],"baselines.":[122]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4}],"updated_date":"2025-12-19T19:40:27.379048","created_date":"2025-10-10T00:00:00"}
