{"id":"https://openalex.org/W4416771678","doi":"https://doi.org/10.1109/sped67700.2025.11253714","title":"Adding Emotion Conditioning in Speech Synthesis via Multi-Term Classifier-Free Guidance","display_name":"Adding Emotion Conditioning in Speech Synthesis via Multi-Term Classifier-Free Guidance","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416771678","doi":"https://doi.org/10.1109/sped67700.2025.11253714"},"language":null,"primary_location":{"id":"doi:10.1109/sped67700.2025.11253714","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11253714","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120480379","display_name":"Radu-George Bolborici","orcid":null},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":true,"raw_author_name":"Radu-George Bolborici","raw_affiliation_strings":["National University of Science and Technology Politehnica,Bucharest,Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology Politehnica,Bucharest,Romania","institution_ids":["https://openalex.org/I61641377"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016527260","display_name":"Ana Neac\u015fu","orcid":"https://orcid.org/0000-0001-7731-1905"},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Ana Neac\u015fu","raw_affiliation_strings":["National University of Science and Technology Politehnica,Bucharest,Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology Politehnica,Bucharest,Romania","institution_ids":["https://openalex.org/I61641377"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5120480379"],"corresponding_institution_ids":["https://openalex.org/I61641377"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41008582,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"86","last_page":"91"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9139000177383423,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9139000177383423,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.03519999980926514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.009200000204145908,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.7501000165939331},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.5264999866485596},{"id":"https://openalex.org/keywords/tone","display_name":"Tone (literature)","score":0.4602000117301941},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.38769999146461487},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.38199999928474426},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.35910001397132874},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.35679998993873596},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.3467000126838684}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.7501000165939331},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.691100001335144},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.5264999866485596},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5195000171661377},{"id":"https://openalex.org/C2780583480","wikidata":"https://www.wikidata.org/wiki/Q1366327","display_name":"Tone (literature)","level":2,"score":0.4602000117301941},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.38769999146461487},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.38199999928474426},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.35910001397132874},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34950000047683716},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3467000126838684},{"id":"https://openalex.org/C109747225","wikidata":"https://www.wikidata.org/wiki/Q815758","display_name":"Scarcity","level":2,"score":0.34459999203681946},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.32839998602867126},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.31869998574256897},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.2896000146865845},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2696000039577484},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C206310091","wikidata":"https://www.wikidata.org/wiki/Q750859","display_name":"Emotion classification","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/sped67700.2025.11253714","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11253714","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W3024869864","https://openalex.org/W3196475561","https://openalex.org/W3196584150","https://openalex.org/W3213879871","https://openalex.org/W4205742757","https://openalex.org/W4361994820","https://openalex.org/W4375869257","https://openalex.org/W4386076493","https://openalex.org/W4388979610","https://openalex.org/W4402112533","https://openalex.org/W4402115961","https://openalex.org/W4404782639","https://openalex.org/W4406461865","https://openalex.org/W4412945617"],"related_works":[],"abstract_inverted_index":{"Many":[0],"recent":[1],"Text-to-Speech":[2],"(TTS)":[3],"models":[4],"employing":[5],"zero-shot":[6],"voice":[7,115],"cloning":[8],"techniques":[9],"are":[10,176],"capable":[11],"of":[12,35,94],"reproducing":[13],"the":[14,19,33,64,70,74,88,92,101,113,117,126,138],"emotional":[15],"tone":[16],"present":[17],"in":[18,48],"reference":[20,118],"speech.":[21],"However,":[22],"they":[23],"frequently":[24],"lack":[25],"mechanisms":[26],"for":[27,44,144],"fine-grained":[28,52],"emotion":[29,53,123,135,167],"control":[30,54,133],"due":[31],"to":[32,50,62,106,125],"scarcity":[34],"labeled":[36],"resources.":[37],"The":[38],"current":[39],"work":[40],"proposes":[41],"a":[42,83],"method":[43,103,131,160],"adjusting":[45],"F5-TTS":[46],"[1]":[47],"order":[49],"achieve":[51],"by":[55,81],"fine-tuning":[56],"under":[57],"low-resource":[58],"conditions,":[59],"while":[60],"aiming":[61],"keep":[63],"original":[65],"model\u2019s":[66],"performance.":[67],"To":[68],"enforce":[69],"newly":[71],"added":[72],"condition,":[73],"classifier-free":[75],"guidance":[76],"(CFG)":[77],"mechanism":[78],"is":[79,104],"modified":[80],"adding":[82],"supplementary":[84],"term":[85],"that":[86,100,158],"amplifies":[87],"condition\u2019s":[89],"contribution,":[90],"at":[91,178],"expense":[93],"increased":[95],"inference":[96],"time.":[97],"We":[98],"show":[99],"proposed":[102],"able":[105],"generate":[107],"natural":[108],"and":[109,150,164,169,173],"intelligible":[110],"speech":[111],"with":[112,121,153],"same":[114],"as":[116],"audio,":[119],"but":[120],"changed":[122],"according":[124],"label":[127],"condition.":[128],"Moreover,":[129],"this":[130,145],"enables":[132],"over":[134],"intensity,":[136],"despite":[137],"model":[139],"not":[140],"being":[141],"explicitly":[142],"trained":[143],"task.":[146],"Extensive":[147],"automated":[148],"evaluations":[149],"subjective":[151],"tests":[152],"20":[154],"human":[155],"participants":[156],"demonstrate":[157],"our":[159],"maintains":[161],"intelligibility,":[162],"naturalness":[163],"allows":[165],"both":[166],"class":[168],"intensity":[170],"control.Implementation":[171],"code":[172],"demo":[174],"samples":[175],"available":[177],"https://github.com/RaduBolbo/F5-TTS-Emotional-CFG":[179]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-27T00:00:00"}
