{"id":"https://openalex.org/W3135864795","doi":"https://doi.org/10.1109/iscslp49672.2021.9362077","title":"Exploring Cross-lingual Singing Voice Synthesis Using Speech Data","display_name":"Exploring Cross-lingual Singing Voice Synthesis Using Speech Data","publication_year":2021,"publication_date":"2021-01-24","ids":{"openalex":"https://openalex.org/W3135864795","doi":"https://doi.org/10.1109/iscslp49672.2021.9362077","mag":"3135864795"},"language":"en","primary_location":{"id":"doi:10.1109/iscslp49672.2021.9362077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp49672.2021.9362077","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013296368","display_name":"Yuewen Cao","orcid":"https://orcid.org/0000-0002-1432-161X"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yuewen Cao","raw_affiliation_strings":["The Chinese University of Hong Kong"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102778496","display_name":"Songxiang Liu","orcid":"https://orcid.org/0000-0002-0943-2446"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Songxiang Liu","raw_affiliation_strings":["The Chinese University of Hong Kong"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083463839","display_name":"Shiyin Kang","orcid":"https://orcid.org/0000-0001-8304-5260"},"institutions":[{"id":"https://openalex.org/I68926175","display_name":"Hoya (Japan)","ror":"https://ror.org/049vpfq31","country_code":"JP","type":"company","lineage":["https://openalex.org/I68926175"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shiyin Kang","raw_affiliation_strings":["Huya Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huya Inc","institution_ids":["https://openalex.org/I68926175"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100685461","display_name":"Na Hu","orcid":"https://orcid.org/0000-0001-6362-0969"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Na Hu","raw_affiliation_strings":["Tencent AI Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent AI Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100346924","display_name":"Peng Liu","orcid":"https://orcid.org/0009-0008-7485-5745"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Liu","raw_affiliation_strings":["Tencent AI Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent AI Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037109470","display_name":"Xunying Liu","orcid":"https://orcid.org/0000-0001-6725-1160"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xunying Liu","raw_affiliation_strings":["The Chinese University of Hong Kong"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075183307","display_name":"Dan Su","orcid":"https://orcid.org/0000-0001-5746-9545"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dan Su","raw_affiliation_strings":["Tencent AI Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent AI Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034476404","display_name":"Dong Yu","orcid":"https://orcid.org/0000-0003-0520-6844"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dong Yu","raw_affiliation_strings":["Tencent AI Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent AI Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019458385","display_name":"Helen Meng","orcid":"https://orcid.org/0000-0002-4427-3532"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Helen Meng","raw_affiliation_strings":["The Chinese University of Hong Kong"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4198,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.67876303,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7960259914398193},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7902661561965942},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.708755373954773},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.6582913398742676},{"id":"https://openalex.org/keywords/mandarin-chinese","display_name":"Mandarin Chinese","score":0.6196452379226685},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.5505936741828918},{"id":"https://openalex.org/keywords/active-listening","display_name":"Active listening","score":0.4988741874694824},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4838201403617859},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4336254298686981},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3662527799606323},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3230127692222595},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.16244667768478394},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1197705864906311},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.11895117163658142},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.10390949249267578}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7960259914398193},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7902661561965942},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.708755373954773},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.6582913398742676},{"id":"https://openalex.org/C138954614","wikidata":"https://www.wikidata.org/wiki/Q9192","display_name":"Mandarin Chinese","level":2,"score":0.6196452379226685},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.5505936741828918},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.4988741874694824},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4838201403617859},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4336254298686981},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3662527799606323},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3230127692222595},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.16244667768478394},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1197705864906311},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.11895117163658142},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.10390949249267578},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iscslp49672.2021.9362077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp49672.2021.9362077","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8199999928474426,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W155077236","https://openalex.org/W1570629387","https://openalex.org/W2005289882","https://openalex.org/W2066598518","https://openalex.org/W2527729766","https://openalex.org/W2778460379","https://openalex.org/W2963568710","https://openalex.org/W2963609956","https://openalex.org/W2963964591","https://openalex.org/W2964002616","https://openalex.org/W2964243274","https://openalex.org/W2964307104","https://openalex.org/W2971753973","https://openalex.org/W2972473628","https://openalex.org/W2972910332","https://openalex.org/W2973046048","https://openalex.org/W2984106626","https://openalex.org/W2995670387","https://openalex.org/W3015437531","https://openalex.org/W3015499232","https://openalex.org/W3015645837","https://openalex.org/W3015844196","https://openalex.org/W3016250102","https://openalex.org/W3019084079","https://openalex.org/W3035430139","https://openalex.org/W3036051869","https://openalex.org/W3081279708","https://openalex.org/W3095074555","https://openalex.org/W3097514409","https://openalex.org/W3133525064","https://openalex.org/W4298580827","https://openalex.org/W6606312984","https://openalex.org/W6746700228","https://openalex.org/W6748409065","https://openalex.org/W6767453231","https://openalex.org/W6772134836","https://openalex.org/W6776400185"],"related_works":["https://openalex.org/W4391272374","https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W40885451","https://openalex.org/W2108985546","https://openalex.org/W2081919107","https://openalex.org/W2433276473","https://openalex.org/W1537411440","https://openalex.org/W2535215250","https://openalex.org/W2024201202"],"abstract_inverted_index":{"State-of-the-art":[0],"singing":[1,9,58,142,156],"voice":[2,10,59,143],"synthesis":[3],"(SVS)":[4],"models":[5],"can":[6,53,140],"generate":[7,141],"natural":[8],"of":[11,37],"a":[12,34,48,80,85,88,93],"target":[13,39],"speaker,":[14],"given":[15],"his/her":[16],"speaking/singing":[17],"data":[18,32,69,157,162],"in":[19,33,60,70],"the":[20,38,120,135],"same":[21],"language.":[22],"However,":[23],"there":[24],"may":[25],"be":[26],"challenging":[27],"conditions":[28],"where":[29],"only":[30,66],"speech":[31,68,161],"non-target":[35],"language":[36,110],"speaker":[40,107,114,125,149],"is":[41,116],"available.":[42],"In":[43],"this":[44],"paper,":[45],"we":[46],"present":[47],"cross-lingual":[49,74,89,137],"SVS":[50,75,138],"system":[51,76,139],"that":[52,134,154],"synthesize":[54],"an":[55],"English":[56],"speaker's":[57],"Mandarin":[61],"from":[62,123],"musical":[63],"scores":[64],"with":[65,144],"her":[67],"English.":[71],"The":[72,96],"pro-posed":[73],"contains":[77],"four":[78],"parts:":[79],"BLSTM":[81],"based":[82],"duration":[83],"model,":[84,87],"pitch":[86,169],"acoustic":[90,97],"model":[91,98],"and":[92,109,129,147,168],"neural":[94],"vocoder.":[95],"employs":[99],"encoder-decoder":[100],"architecture":[101],"conditioned":[102],"on":[103,166],"pitch,":[104],"phoneme":[105],"duration,":[106],"information":[108],"information.":[111,126],"An":[112],"adversarially-trained":[113],"classifier":[115],"employed":[117],"to":[118],"discourage":[119],"text":[121],"encodings":[122],"capturing":[124],"Objective":[127],"evaluation":[128],"subjective":[130],"listening":[131],"tests":[132],"demonstrate":[133],"proposed":[136],"decent":[145],"naturalness":[146],"fair":[148],"similarity.":[150],"We":[151],"also":[152],"find":[153],"adding":[155],"or":[158],"multi-speaker":[159],"monolingual":[160],"further":[163],"improves":[164],"generalization":[165],"pronunciation":[167],"accuracy.":[170]},"counts_by_year":[{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
