{"id":"https://openalex.org/W3163677684","doi":"https://doi.org/10.1109/icassp39728.2021.9414422","title":"Bi-Level Style and Prosody Decoupling Modeling for Personalized End-to-End Speech Synthesis","display_name":"Bi-Level Style and Prosody Decoupling Modeling for Personalized End-to-End Speech Synthesis","publication_year":2021,"publication_date":"2021-05-13","ids":{"openalex":"https://openalex.org/W3163677684","doi":"https://doi.org/10.1109/icassp39728.2021.9414422","mag":"3163677684"},"language":"en","primary_location":{"id":"doi:10.1109/icassp39728.2021.9414422","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414422","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073918837","display_name":"Ruibo Fu","orcid":"https://orcid.org/0000-0001-9598-1881"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruibo Fu","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112613657","display_name":"Jianhua Tao","orcid":"https://orcid.org/0000-0002-9344-6428"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210097554","display_name":"Center for Excellence in Brain Science and Intelligence Technology","ror":"https://ror.org/00vpwhm04","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210097554"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianhua Tao","raw_affiliation_strings":["CAS Center for Excellence in Brain Science and Intelligence Technology, Beijing","National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CAS Center for Excellence in Brain Science and Intelligence Technology, Beijing","institution_ids":["https://openalex.org/I4210097554"]},{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111856667","display_name":"Zhengqi Wen","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhengqi Wen","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078525423","display_name":"Jiangyan Yi","orcid":"https://orcid.org/0000-0003-2422-4618"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiangyan Yi","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453576","display_name":"Tao Wang","orcid":"https://orcid.org/0000-0002-6785-1251"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Wang","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210165038"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028353824","display_name":"Chunyu Qiang","orcid":"https://orcid.org/0009-0007-2290-3074"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chunyu Qiang","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210165038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.06257509,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"6568","last_page":"6572"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8511725068092346},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.8345520496368408},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7725183963775635},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6686831712722778},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5946680307388306},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5151594877243042},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4809693396091461},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4507753849029541},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34341830015182495}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8511725068092346},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.8345520496368408},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7725183963775635},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6686831712722778},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5946680307388306},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5151594877243042},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4809693396091461},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4507753849029541},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34341830015182495},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp39728.2021.9414422","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414422","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/1","display_name":"No poverty","score":0.44999998807907104}],"awards":[],"funders":[{"id":"https://openalex.org/F4320316083","display_name":"Tencent","ror":"https://ror.org/00hhjss72"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320337504","display_name":"Research and Development","ror":"https://ror.org/027s68j25"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1967060135","https://openalex.org/W2129244720","https://openalex.org/W2518172956","https://openalex.org/W2519091744","https://openalex.org/W2619368999","https://openalex.org/W2794490148","https://openalex.org/W2795109282","https://openalex.org/W2808706139","https://openalex.org/W2887511658","https://openalex.org/W2963091184","https://openalex.org/W2963242190","https://openalex.org/W2963272440","https://openalex.org/W2963432880","https://openalex.org/W2963609956","https://openalex.org/W2963691546","https://openalex.org/W2963927338","https://openalex.org/W2964243274","https://openalex.org/W3015796413","https://openalex.org/W3015826515","https://openalex.org/W3015938946","https://openalex.org/W4295731579","https://openalex.org/W6749489859","https://openalex.org/W6750489868","https://openalex.org/W6752888775","https://openalex.org/W6765987481"],"related_works":["https://openalex.org/W3100825170","https://openalex.org/W3134175397","https://openalex.org/W1914543332","https://openalex.org/W1927421023","https://openalex.org/W10581632","https://openalex.org/W2108985546","https://openalex.org/W3203313352","https://openalex.org/W3149582125","https://openalex.org/W2077992636","https://openalex.org/W1984347656"],"abstract_inverted_index":{"End-to-end":[0],"framework":[1,51],"can":[2],"generate":[3],"high-quality":[4],"and":[5,32,35,56,92,103,126,133,162],"high-similarity":[6],"speech":[7,11],"in":[8,140],"the":[9,15,39,64,71,81,89,100,114,118,129,151],"personalized":[10],"synthesis":[12],"task.":[13,24],"However,":[14],"generalization":[16],"of":[17,38],"out-of-domain":[18],"texts":[19],"is":[20,86,122],"still":[21],"a":[22,47,108],"challenging":[23],"Limited":[25],"target":[26],"data":[27],"leads":[28],"to":[29,52,78,155],"unacceptable":[30],"errors":[31],"poor":[33],"prosody":[34,93,131],"similarity":[36],"performance":[37],"synthetic":[40],"speech.":[41],"In":[42],"this":[43],"paper,":[44],"we":[45],"present":[46],"bi-level":[48],"function":[49],"decoupling":[50],"realise":[53],"separate":[54,142],"modeling":[55,67],"controlling":[57,132],"for":[58],"solving":[59],"above":[60],"problems.":[61],"Firstly,":[62],"on":[63,99,113,146,158],"style":[65],"representation":[66],"level,":[68,117],"compared":[69],"with":[70],"conventional":[72],"methods":[73,153],"that":[74,88,150],"use":[75],"single":[76],"embedding":[77,91,94],"model":[79,115,120],"all":[80],"text":[82],"dependent":[83],"discrepancies,":[84],"it":[85],"proposed":[87,152],"speaker":[90,134],"are":[95,137],"modeled":[96],"separately":[97],"based":[98],"reference":[101],"audio":[102],"phonetic":[104],"posteriorgram":[105],"(PPG)":[106],"by":[107],"multi-head":[109],"attention":[110],"mechanism.":[111],"Secondly,":[112],"structure":[116,121],"decoder":[119],"factored":[123],"into":[124],"average-net":[125],"adaptation-net,":[127],"where":[128],"duration":[130],"timbre":[135],"imitation":[136],"mainly":[138],"designed":[139],"relatively":[141],"areas.":[143],"Experimental":[144],"results":[145],"Mandarin":[147],"dataset":[148],"show":[149],"lead":[154],"an":[156],"improvement":[157],"both":[159],"robustness,":[160],"naturalness":[161],"similarity.":[163]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
