{"id":"https://openalex.org/W4406461764","doi":"https://doi.org/10.1109/slt61566.2024.10832287","title":"Emotion-Coherent Speech Data Augmentation And Self-Supervised Contrastive Style Training For Enhancing Kids\u2019s Story Speech Synthesis","display_name":"Emotion-Coherent Speech Data Augmentation And Self-Supervised Contrastive Style Training For Enhancing Kids\u2019s Story Speech Synthesis","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461764","doi":"https://doi.org/10.1109/slt61566.2024.10832287"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832287","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832287","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2602.10164","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020069661","display_name":"Raymond T. Chung","orcid":"https://orcid.org/0000-0001-7587-718X"},"institutions":[{"id":"https://openalex.org/I4210131801","display_name":"Hong Kong R&D Centre for Logistics and Supply Chain Management Enabling Technologies","ror":"https://ror.org/03nm59d75","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210131801"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Raymond Chung","raw_affiliation_strings":["Logistics and Supply Chain MultiTech R&#x0026;D Centre,Pok Fu Lam,Hong Kong"],"affiliations":[{"raw_affiliation_string":"Logistics and Supply Chain MultiTech R&#x0026;D Centre,Pok Fu Lam,Hong Kong","institution_ids":["https://openalex.org/I4210131801"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5020069661"],"corresponding_institution_ids":["https://openalex.org/I4210131801"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2336663,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"735","last_page":"741"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9898999929428101,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6928563117980957},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6442288160324097},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.6062630414962769},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.537034273147583},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.52215176820755},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.5193532705307007},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.5068073868751526},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4179084002971649},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.362068235874176}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6928563117980957},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6442288160324097},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.6062630414962769},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.537034273147583},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.52215176820755},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.5193532705307007},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.5068073868751526},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4179084002971649},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.362068235874176},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/slt61566.2024.10832287","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832287","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2602.10164","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2602.10164","pdf_url":"https://arxiv.org/pdf/2602.10164","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:doi:10.48550/arxiv.2602.10164","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2602.10164","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2602.10164","pdf_url":"https://arxiv.org/pdf/2602.10164","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320326427","display_name":"Innovation and Technology Fund","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W2747874407","https://openalex.org/W2786818226","https://openalex.org/W2885800352","https://openalex.org/W2891575196","https://openalex.org/W2896457183","https://openalex.org/W2932022923","https://openalex.org/W2963300588","https://openalex.org/W2964243274","https://openalex.org/W2966387353","https://openalex.org/W2967220154","https://openalex.org/W2972702018","https://openalex.org/W3015645837","https://openalex.org/W3015922793","https://openalex.org/W3033411150","https://openalex.org/W3036601975","https://openalex.org/W3198608154","https://openalex.org/W4200047557","https://openalex.org/W4205742757","https://openalex.org/W4224919704","https://openalex.org/W4288089799","https://openalex.org/W4296068797","https://openalex.org/W4297841714","https://openalex.org/W4320009994","https://openalex.org/W4367000116","https://openalex.org/W4372266971","https://openalex.org/W4385822941","https://openalex.org/W4385993887","https://openalex.org/W4385993905","https://openalex.org/W6750489868","https://openalex.org/W6755207826","https://openalex.org/W6760861152","https://openalex.org/W6769627184","https://openalex.org/W6774314701","https://openalex.org/W6778823374","https://openalex.org/W6780218876","https://openalex.org/W6847632070","https://openalex.org/W6849618266","https://openalex.org/W6851684569","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W2356229341","https://openalex.org/W3216976533","https://openalex.org/W100620283","https://openalex.org/W4394050964","https://openalex.org/W2551249631"],"abstract_inverted_index":{"Expressive":[0],"speech":[1,41,76,124],"synthesis":[2],"requires":[3],"vibrant":[4],"prosody":[5],"and":[6,129],"well-timed":[7],"pauses.":[8],"We":[9,26,55],"propose":[10],"an":[11,21],"effective":[12],"strategy":[13],"to":[14,19,61,96,114],"augment":[15],"a":[16,34,97,109],"small":[17],"dataset":[18],"train":[20],"expressive":[22,40],"end-to-end":[23],"Text-to-Speech":[24],"model.":[25],"merge":[27],"audios":[28],"of":[29,90],"emotionally":[30],"congruent":[31],"text":[32,35],"using":[33],"emotion":[36],"recognizer,":[37],"creating":[38],"augmented":[39],"data.":[42],"By":[43],"training":[44,60],"with":[45,101],"two-sentence":[46,103],"audio,":[47],"our":[48,72,91,122],"model":[49,73,99],"learns":[50],"natural":[51],"breaks":[52],"between":[53],"lines.":[54],"further":[56],"apply":[57],"self-supervised":[58],"contrastive":[59],"improve":[62],"the":[63,82,88,115,133],"speaking":[64,84],"style":[65,130],"embedding":[66],"extraction":[67],"from":[68],"speech.":[69,118],"During":[70],"inference,":[71],"produces":[74],"multi-sentence":[75],"in":[77,127],"one":[78],"step,":[79],"guided":[80],"by":[81],"text-predicted":[83],"style.":[85],"Evaluations":[86],"showcase":[87],"effectiveness":[89],"proposed":[92],"approach":[93],"when":[94],"compared":[95],"baseline":[98],"trained":[100],"consecutive":[102],"audio.":[104],"Our":[105],"synthesized":[106,123],"speeches":[107],"give":[108],"closer":[110],"inter-sentence":[111],"pause":[112],"distribution":[113],"ground":[116],"truth":[117],"Subjective":[119],"evaluations":[120],"reveal":[121],"scored":[125],"higher":[126],"naturalness":[128],"suitability":[131],"than":[132],"baseline.":[134]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
