{"id":"https://openalex.org/W4399096462","doi":"https://doi.org/10.1186/s13636-024-00351-9","title":"Optimizing feature fusion for improved zero-shot adaptation in text-to-speech synthesis","display_name":"Optimizing feature fusion for improved zero-shot adaptation in text-to-speech synthesis","publication_year":2024,"publication_date":"2024-05-28","ids":{"openalex":"https://openalex.org/W4399096462","doi":"https://doi.org/10.1186/s13636-024-00351-9"},"language":"en","primary_location":{"id":"doi:10.1186/s13636-024-00351-9","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13636-024-00351-9","pdf_url":"https://asmp-eurasipjournals.springeropen.com/counter/pdf/10.1186/s13636-024-00351-9","source":{"id":"https://openalex.org/S19605986","display_name":"EURASIP Journal on Audio Speech and Music Processing","issn_l":"1687-4714","issn":["1687-4714","1687-4722"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"EURASIP Journal on Audio, Speech, and Music Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://asmp-eurasipjournals.springeropen.com/counter/pdf/10.1186/s13636-024-00351-9","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057534444","display_name":"Zhiyong Chen","orcid":"https://orcid.org/0000-0002-9629-6111"},"institutions":[{"id":"https://openalex.org/I113940042","display_name":"Shanghai University","ror":"https://ror.org/006teas31","country_code":"CN","type":"education","lineage":["https://openalex.org/I113940042"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiyong Chen","raw_affiliation_strings":["School of Communication and Information Engineering, Shanghai University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Communication and Information Engineering, Shanghai University, Shanghai, China","institution_ids":["https://openalex.org/I113940042"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113231440","display_name":"Zhiqi Ai","orcid":"https://orcid.org/0009-0005-1034-9972"},"institutions":[{"id":"https://openalex.org/I113940042","display_name":"Shanghai University","ror":"https://ror.org/006teas31","country_code":"CN","type":"education","lineage":["https://openalex.org/I113940042"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiqi Ai","raw_affiliation_strings":["School of Communication and Information Engineering, Shanghai University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Communication and Information Engineering, Shanghai University, Shanghai, China","institution_ids":["https://openalex.org/I113940042"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000212240","display_name":"Youxuan Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I113940042","display_name":"Shanghai University","ror":"https://ror.org/006teas31","country_code":"CN","type":"education","lineage":["https://openalex.org/I113940042"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Youxuan Ma","raw_affiliation_strings":["School of Communication and Information Engineering, Shanghai University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Communication and Information Engineering, Shanghai University, Shanghai, China","institution_ids":["https://openalex.org/I113940042"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015889784","display_name":"Xinnuo Li","orcid":null},"institutions":[{"id":"https://openalex.org/I113940042","display_name":"Shanghai University","ror":"https://ror.org/006teas31","country_code":"CN","type":"education","lineage":["https://openalex.org/I113940042"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinnuo Li","raw_affiliation_strings":["School of Communication and Information Engineering, Shanghai University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Communication and Information Engineering, Shanghai University, Shanghai, China","institution_ids":["https://openalex.org/I113940042"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084521046","display_name":"Shugong Xu","orcid":"https://orcid.org/0000-0003-1905-6269"},"institutions":[{"id":"https://openalex.org/I113940042","display_name":"Shanghai University","ror":"https://ror.org/006teas31","country_code":"CN","type":"education","lineage":["https://openalex.org/I113940042"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shugong Xu","raw_affiliation_strings":["School of Communication and Information Engineering, Shanghai University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Communication and Information Engineering, Shanghai University, Shanghai, China","institution_ids":["https://openalex.org/I113940042"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5057534444"],"corresponding_institution_ids":["https://openalex.org/I113940042"],"apc_list":{"value":1115,"currency":"GBP","value_usd":1367},"apc_paid":{"value":1115,"currency":"GBP","value_usd":1367},"fwci":1.772,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.86543249,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":97},"biblio":{"volume":"2024","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.7312259674072266},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6460769176483154},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.6236356496810913},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.6153648495674133},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.5566956400871277},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5436150431632996},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48281344771385193},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.418855220079422},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.40752658247947693},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.38985341787338257},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.23385685682296753},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.18746304512023926}],"concepts":[{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.7312259674072266},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6460769176483154},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.6236356496810913},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.6153648495674133},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.5566956400871277},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5436150431632996},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48281344771385193},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.418855220079422},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.40752658247947693},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.38985341787338257},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.23385685682296753},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.18746304512023926},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1186/s13636-024-00351-9","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13636-024-00351-9","pdf_url":"https://asmp-eurasipjournals.springeropen.com/counter/pdf/10.1186/s13636-024-00351-9","source":{"id":"https://openalex.org/S19605986","display_name":"EURASIP Journal on Audio Speech and Music Processing","issn_l":"1687-4714","issn":["1687-4714","1687-4722"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"EURASIP Journal on Audio, Speech, and Music Processing","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:e08805403ca34028a8d742b9a22ab0cc","is_oa":true,"landing_page_url":"https://doaj.org/article/e08805403ca34028a8d742b9a22ab0cc","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"EURASIP Journal on Audio, Speech, and Music Processing, Vol 2024, Iss 1, Pp 1-18 (2024)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1186/s13636-024-00351-9","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s13636-024-00351-9","pdf_url":"https://asmp-eurasipjournals.springeropen.com/counter/pdf/10.1186/s13636-024-00351-9","source":{"id":"https://openalex.org/S19605986","display_name":"EURASIP Journal on Audio Speech and Music Processing","issn_l":"1687-4714","issn":["1687-4714","1687-4722"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"EURASIP Journal on Audio, Speech, and Music Processing","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.47999998927116394,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1995829561","display_name":null,"funder_award_id":"Grant FS0AA-KJ919- 4402-0060","funder_id":"https://openalex.org/F4320327067","funder_display_name":"Foshan Science and Technology Bureau"},{"id":"https://openalex.org/G4682521654","display_name":null,"funder_award_id":"Grant 2020B0101130012","funder_id":"https://openalex.org/F4320336405","funder_display_name":"Special Project for Research and Development in Key areas of Guangdong Province"},{"id":"https://openalex.org/G5909354252","display_name":null,"funder_award_id":"Grants 61871262, 61901251, and 62071284","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7004910593","display_name":null,"funder_award_id":"Grants 21ZR1422400, 20JC1416400 and 20511106603","funder_id":"https://openalex.org/F4320321885","funder_display_name":"Science and Technology Commission of Shanghai Municipality"}],"funders":[{"id":"https://openalex.org/F4320316073","display_name":"Government of Pudong New Area","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321885","display_name":"Science and Technology Commission of Shanghai Municipality","ror":"https://ror.org/03kt66j61"},{"id":"https://openalex.org/F4320327067","display_name":"Foshan Science and Technology Bureau","ror":null},{"id":"https://openalex.org/F4320336405","display_name":"Special Project for Research and Development in Key areas of Guangdong Province","ror":null}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4399096462.pdf"},"referenced_works_count":24,"referenced_works":["https://openalex.org/W1861172732","https://openalex.org/W1999885698","https://openalex.org/W2107860279","https://openalex.org/W2111284386","https://openalex.org/W2144994235","https://openalex.org/W2166469361","https://openalex.org/W2808706139","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W3015826515","https://openalex.org/W3162794600","https://openalex.org/W3198533616","https://openalex.org/W3213984348","https://openalex.org/W4225746985","https://openalex.org/W4313148337","https://openalex.org/W4319586596","https://openalex.org/W4375869257","https://openalex.org/W4385822787","https://openalex.org/W4385823163","https://openalex.org/W4387595589","https://openalex.org/W6600741150","https://openalex.org/W6601949647","https://openalex.org/W6607786901","https://openalex.org/W6828894009"],"related_works":["https://openalex.org/W2997567050","https://openalex.org/W1483272040","https://openalex.org/W4283377908","https://openalex.org/W1533421371","https://openalex.org/W2003050223","https://openalex.org/W3013650182","https://openalex.org/W2989283631","https://openalex.org/W2091777911","https://openalex.org/W2766405861","https://openalex.org/W4249605382"],"abstract_inverted_index":{"Abstract":[0],"In":[1],"the":[2,71,86,142],"era":[3],"of":[4,10,62,73,118,144,185,224],"advanced":[5,116,150],"text-to-speech":[6],"(TTS)":[7],"systems":[8,56,152],"capable":[9],"generating":[11],"high-fidelity,":[12],"human-like":[13],"speech":[14,40],"by":[15,146],"referring":[16],"a":[17,50,92,109,122,131,173],"reference":[18,47],"speech,":[19],"voice":[20,76,97,170],"cloning":[21,98,171],"(VC),":[22],"or":[23,105,182],"zero-shot":[24,181,198],"TTS":[25,151,187],"(ZS-TTS),":[26],"stands":[27],"out":[28],"as":[29],"an":[30,115],"important":[31],"subtask.":[32],"A":[33,222],"primary":[34],"challenge":[35],"in":[36,197],"VC":[37,55,204],"is":[38,214,227],"maintaining":[39],"quality":[41],"and":[42,78,94,127,156,219,238],"speaker":[43,64,67,104,138,175],"similarity":[44],"with":[45,176,207],"limited":[46],"data":[48,178],"for":[49,66,108,172],"specific":[51],"speaker.":[52,111],"However,":[53],"existing":[54,202],"often":[57],"rely":[58],"on":[59],"naive":[60],"combinations":[61],"embedded":[63],"vectors":[65],"control,":[68],"which":[69],"compromises":[70],"capture":[72],"speaking":[74],"style,":[75],"print,":[77],"semantic":[79],"accuracy.":[80],"To":[81],"overcome":[82],"this,":[83],"we":[84],"introduce":[85],"Two-branch":[87],"Speaker":[88],"Control":[89],"Module":[90],"(TSCM),":[91],"novel":[93],"highly":[95],"adaptable":[96],"module":[99],"designed":[100],"to":[101,136,201],"precisely":[102],"processing":[103],"style":[106],"control":[107],"target":[110,174],"Our":[112,211],"method":[113],"uses":[114],"fusion":[117],"local-level":[119],"features":[120,129],"from":[121,130],"Gated":[123],"Convolutional":[124],"Network":[125],"(GCN)":[126],"utterance-level":[128],"gated":[132],"recurrent":[133],"unit":[134],"(GRU)":[135],"enhance":[137],"control.":[139],"We":[140],"demonstrate":[141],"effectiveness":[143],"TSCM":[145,167],"integrating":[147],"it":[148],"into":[149,235],"like":[153],"FastSpeech":[154],"2":[155],"VITS":[157,192],"architectures,":[158],"significantly":[159],"optimizing":[160],"their":[161],"performance.":[162],"Experimental":[163],"results":[164],"show":[165],"that":[166],"enables":[168],"accurate":[169],"minimal":[177],"through":[179,216],"both":[180],"few-shot":[183],"fine-tuning":[184],"pretrained":[186],"models.":[188],"Furthermore,":[189],"our":[190,225],"TSCM-based":[191],"(TSCM-VITS)":[193],"showcases":[194],"superior":[195],"performance":[196],"scenarios":[199],"compared":[200],"state-of-the-art":[203],"systems,":[205],"even":[206],"basic":[208],"dataset":[209],"configurations.":[210],"method\u2019s":[212],"superiority":[213],"validated":[215],"comprehensive":[217],"subjective":[218],"objective":[220],"evaluations.":[221],"demonstration":[223],"system":[226],"available":[228],"at":[229],"https://great-research.github.io/tsct-tts-demo/":[230],",":[231],"providing":[232],"practical":[233],"insights":[234],"its":[236],"application":[237],"effectiveness.":[239]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
