{"id":"https://openalex.org/W4402390188","doi":"https://doi.org/10.1109/ialp63756.2024.10661140","title":"End-to-end Tibetan emotional speech synthesis based on Mandarin emotions transfer","display_name":"End-to-end Tibetan emotional speech synthesis based on Mandarin emotions transfer","publication_year":2024,"publication_date":"2024-08-04","ids":{"openalex":"https://openalex.org/W4402390188","doi":"https://doi.org/10.1109/ialp63756.2024.10661140"},"language":"en","primary_location":{"id":"doi:10.1109/ialp63756.2024.10661140","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ialp63756.2024.10661140","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Conference on Asian Language Processing (IALP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060794156","display_name":"Weizhao Zhang","orcid":"https://orcid.org/0000-0002-3692-4921"},"institutions":[{"id":"https://openalex.org/I68986083","display_name":"Northwest Normal University","ror":"https://ror.org/00gx3j908","country_code":"CN","type":"education","lineage":["https://openalex.org/I68986083"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Weizhao Zhang","raw_affiliation_strings":["Northwest Normal University,College of Physics and Electronic Engineering,Lanzhou,China"],"affiliations":[{"raw_affiliation_string":"Northwest Normal University,College of Physics and Electronic Engineering,Lanzhou,China","institution_ids":["https://openalex.org/I68986083"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5115595118","display_name":"Wenxuan Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I68986083","display_name":"Northwest Normal University","ror":"https://ror.org/00gx3j908","country_code":"CN","type":"education","lineage":["https://openalex.org/I68986083"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxuan Zhang","raw_affiliation_strings":["Northwest Normal University,College of Physics and Electronic Engineering,Lanzhou,China"],"affiliations":[{"raw_affiliation_string":"Northwest Normal University,College of Physics and Electronic Engineering,Lanzhou,China","institution_ids":["https://openalex.org/I68986083"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5060794156"],"corresponding_institution_ids":["https://openalex.org/I68986083"],"apc_list":null,"apc_paid":null,"fwci":0.3637,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.66580001,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"320","last_page":"325"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9733999967575073,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mandarin-chinese","display_name":"Mandarin Chinese","score":0.7952680587768555},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5454199910163879},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.48190003633499146},{"id":"https://openalex.org/keywords/transfer","display_name":"Transfer (computing)","score":0.4790459871292114},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4531914293766022},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.18355205655097961},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.057724714279174805}],"concepts":[{"id":"https://openalex.org/C138954614","wikidata":"https://www.wikidata.org/wiki/Q9192","display_name":"Mandarin Chinese","level":2,"score":0.7952680587768555},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5454199910163879},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.48190003633499146},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.4790459871292114},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4531914293766022},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.18355205655097961},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.057724714279174805},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ialp63756.2024.10661140","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ialp63756.2024.10661140","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Conference on Asian Language Processing (IALP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320312274","display_name":"Northwest Normal University","ror":"https://ror.org/00gx3j908"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W2081254602","https://openalex.org/W2747329762","https://openalex.org/W2754244673","https://openalex.org/W3135644023","https://openalex.org/W3195366750","https://openalex.org/W3197034238","https://openalex.org/W3213227271","https://openalex.org/W4210777104","https://openalex.org/W4225746985","https://openalex.org/W4283832640","https://openalex.org/W4372260402","https://openalex.org/W4379193822","https://openalex.org/W4386536205","https://openalex.org/W4391021646","https://openalex.org/W6778823374","https://openalex.org/W6783867762","https://openalex.org/W6795807602","https://openalex.org/W6803159591"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2990005675","https://openalex.org/W2374317326","https://openalex.org/W1603321096","https://openalex.org/W2394766824","https://openalex.org/W2078713291","https://openalex.org/W2361574037","https://openalex.org/W2386292991","https://openalex.org/W127416991"],"abstract_inverted_index":{"Emotional":[0],"speech":[1,8,19,56,164,191,203,218,235,258],"synthesis":[2,20,57],"has":[3],"attracted":[4],"much":[5],"attention":[6],"in":[7,11,23,82,165,219,238],"synthesis,":[9],"especially":[10],"low-resource":[12],"languages":[13],"like":[14],"Tibetan.":[15],"However,":[16],"Tibetan":[17,35,55,170,180,190,220,233],"emotional":[18,36,45,54,73,140,148,155,179,212,228,251],"is":[21,132],"still":[22],"its":[24],"infancy,":[25],"facing":[26],"challenges":[27],"such":[28],"as":[29,98],"the":[30,79,83,87,103,112,117,125,137,144,197,211,216,227,232,245,250,255],"lack":[31],"of":[32,67,86,139,215,231,247,254],"available":[33],"public":[34],"datasets":[37],"and":[38,44,72,120,150,171,221,241],"issues":[39],"related":[40],"to":[41,101,106,115,123,135],"speaker":[42,118,126],"disentanglement":[43],"confusion.":[46],"To":[47],"address":[48,136],"these":[49],"problems,":[50],"we":[51,77,110,161,209],"propose":[52],"an":[53,91],"method":[58],"based":[59],"on":[60],"improved":[61],"FastSpeech2":[62,89],"training":[63],"with":[64,90],"a":[65],"mix":[66],"neutral":[68,70],"Tibetan,":[69],"Mandarin,":[71],"Mandarin":[74,172,175,202,257],"datasets.":[75],"First,":[76],"replaced":[78],"normalization":[80],"layer":[81,93],"transformer":[84],"structure":[85],"original":[88],"emotion-conditioned":[92],"normalization(ELN),":[94],"using":[95],"emotion":[96,121],"embeddings":[97],"conditional":[99],"inputs":[100],"improve":[102],"model\u2019s":[104],"ability":[105],"learn":[107],"emotions.":[108,264],"Then,":[109],"used":[111],"orthogonal":[113,130],"loss":[114,131],"disentangle":[116],"vector":[119,122],"alleviate":[124],"leakage":[127],"problem.":[128],"Additionally,":[129,208],"also":[133],"employed":[134],"problem":[138],"confusion":[141],"by":[142,173],"enhancing":[143],"correlation":[145],"between":[146],"similar":[147],"features":[149],"ensuring":[151],"independence":[152],"among":[153],"different":[154],"features.":[156],"Experimental":[157],"results":[158,224],"showed":[159,225],"that":[160,226],"successfully":[162],"synthesized":[163,189,201,217,234,256],"five":[166,263],"emotions":[167,176,243],"for":[168,187,199,244],"both":[169],"transferring":[174],"without":[177],"any":[178],"dataset.":[181],"The":[182,223],"mean":[183],"opinion":[184],"score":[185],"(MOS)":[186],"all":[188,200,262],"was":[192,204],"3.87":[193],"or":[194,206],"higher,":[195],"while":[196,249],"MOS":[198],"3.81":[205],"higher.":[207],"evaluated":[210],"transmission":[213,229,252],"accuracy":[214,230,253],"Mandarin.":[222],"exceeded":[236,259],"80%":[237],"neutral,":[239],"sad,":[240],"surprised":[242],"majority":[246],"speakers,":[248],"69%":[260],"across":[261]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-01-13T01:12:25.745995","created_date":"2025-10-10T00:00:00"}
