{"id":"https://openalex.org/W4416799865","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249208","title":"Emotion-Rich Cross-Speaker TTS via Contrastive Prosody Enhancement","display_name":"Emotion-Rich Cross-Speaker TTS via Contrastive Prosody Enhancement","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416799865","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249208"},"language":null,"primary_location":{"id":"doi:10.1109/apsipaasc65261.2025.11249208","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249208","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061908942","display_name":"Jen\u2010Tzung Chien","orcid":"https://orcid.org/0000-0003-3466-8941"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Jen-Tzung Chien","raw_affiliation_strings":["Institute of Electrical and Computer Engineering, National Yang Ming Chiao Tung University,Hsinchu,Taiwan"],"affiliations":[{"raw_affiliation_string":"Institute of Electrical and Computer Engineering, National Yang Ming Chiao Tung University,Hsinchu,Taiwan","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5037618382","display_name":"Bic Ngo","orcid":"https://orcid.org/0000-0003-3667-1992"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Bryan Gautama Ngo","raw_affiliation_strings":["Institute of Electrical and Computer Engineering, National Yang Ming Chiao Tung University,Hsinchu,Taiwan"],"affiliations":[{"raw_affiliation_string":"Institute of Electrical and Computer Engineering, National Yang Ming Chiao Tung University,Hsinchu,Taiwan","institution_ids":["https://openalex.org/I148366613"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5061908942"],"corresponding_institution_ids":["https://openalex.org/I148366613"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41266436,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1110","last_page":"1115"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.8162999749183655,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.8162999749183655,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.1005999967455864,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.011699999682605267,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.9261999726295471},{"id":"https://openalex.org/keywords/emotional-prosody","display_name":"Emotional prosody","score":0.570900022983551},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.45559999346733093},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.41530001163482666},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.37139999866485596},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.3102000057697296}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.9261999726295471},{"id":"https://openalex.org/C2778262033","wikidata":"https://www.wikidata.org/wiki/Q5373795","display_name":"Emotional prosody","level":3,"score":0.570900022983551},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.49480000138282776},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4925000071525574},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.45559999346733093},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.44760000705718994},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4262999892234802},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42309999465942383},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.41530001163482666},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.37139999866485596},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.3102000057697296},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.2671999931335449}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/apsipaasc65261.2025.11249208","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249208","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2092939357","https://openalex.org/W2146334809","https://openalex.org/W2903739847","https://openalex.org/W3097290232","https://openalex.org/W3116256056","https://openalex.org/W3163573274","https://openalex.org/W3198533616","https://openalex.org/W4224926192","https://openalex.org/W4225939199","https://openalex.org/W4226421465","https://openalex.org/W4297841867","https://openalex.org/W4385822470","https://openalex.org/W4391021646","https://openalex.org/W4391620705","https://openalex.org/W4392903682","https://openalex.org/W4396982391","https://openalex.org/W4400111321","https://openalex.org/W4402111729","https://openalex.org/W4405633752","https://openalex.org/W4405633932","https://openalex.org/W4405634116","https://openalex.org/W4406861196","https://openalex.org/W4408345862","https://openalex.org/W4409474293"],"related_works":[],"abstract_inverted_index":{"Cross-speaker":[0],"prosody":[1,32,58,78,108],"transfer":[2],"provides":[3],"a":[4,25,37,63],"meaningful":[5],"approach":[6,65],"to":[7,31,47,66,74,90,99,112],"develop":[8],"an":[9],"emotional":[10,72,105,128,136],"text-to-speech":[11],"(TTS)":[12],"system":[13],"where":[14],"emotionally":[15],"diverse":[16],"speech":[17,73],"data":[18],"are":[19,40],"limited":[20],"and":[21,130,147],"only":[22],"available":[23],"from":[24,71,85],"single":[26],"source":[27],"speaker.":[28],"Current":[29],"approaches":[30],"embedding":[33,79],"learning":[34,55,96],"for":[35,57,135,144],"such":[36],"challenging":[38],"task":[39],"suboptimal.":[41],"A":[42],"key":[43],"issue":[44],"is":[45,97,110],"due":[46],"the":[48,54,77,82,91,94,101,120,123],"contamination":[49],"of":[50,122,140],"non-prosodic":[51,68],"features":[52,70],"during":[53],"process":[56],"transfer.":[59],"This":[60],"paper":[61],"presents":[62],"new":[64],"disentangle":[67],"phonetic":[69],"ensure":[75],"that":[76],"exclusively":[80],"captures":[81],"prosodic":[83],"characteristics":[84],"reference":[86],"speech.":[87],"In":[88],"addition":[89],"content":[92],"disentanglement,":[93],"contrastive":[95],"merged":[98],"enhance":[100],"discrimination":[102],"among":[103],"different":[104],"features.":[106],"Contrastive":[107],"modeling":[109],"performed":[111],"facilitate":[113],"emotion-rich":[114],"cross-speaker":[115],"TTS.":[116],"Experimental":[117],"results":[118],"demonstrate":[119],"merit":[121],"proposed":[124],"method":[125],"in":[126,138],"extracting":[127],"nuances":[129],"transferring":[131],"them":[132],"across":[133],"speakers":[134],"TTS":[137],"terms":[139],"mean":[141],"opinion":[142],"score":[143],"emotion":[145],"similarity":[146],"naturalness.":[148]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-28T00:00:00"}
