{"id":"https://openalex.org/W3146962208","doi":"https://doi.org/10.1109/slt48900.2021.9383585","title":"Learn2Sing: Target Speaker Singing Voice Synthesis by Learning from a Singing Teacher","display_name":"Learn2Sing: Target Speaker Singing Voice Synthesis by Learning from a Singing Teacher","publication_year":2021,"publication_date":"2021-01-19","ids":{"openalex":"https://openalex.org/W3146962208","doi":"https://doi.org/10.1109/slt48900.2021.9383585","mag":"3146962208"},"language":"en","primary_location":{"id":"doi:10.1109/slt48900.2021.9383585","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383585","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064426059","display_name":"Heyang Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heyang Xue","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Software, Xian, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Software, Xian, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101736420","display_name":"Shan Yang","orcid":"https://orcid.org/0000-0003-4464-146X"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shan Yang","raw_affiliation_strings":["Northwestern Polytechnical University, Xian, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, Xian, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013928267","display_name":"Yi Lei","orcid":"https://orcid.org/0000-0002-9256-9311"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Lei","raw_affiliation_strings":["Northwestern Polytechnical University, Xian, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, Xian, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066245750","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-9051-2111"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Software, Xian, China","Northwestern Polytechnical University, Xian, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Software, Xian, China","institution_ids":[]},{"raw_affiliation_string":"Northwestern Polytechnical University, Xian, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101562197","display_name":"Xiulin Li","orcid":"https://orcid.org/0000-0003-3581-2968"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiulin Li","raw_affiliation_strings":["Databaker (Beijing) Technology Co., Ltd., Xian, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Databaker (Beijing) Technology Co., Ltd., Xian, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.2594,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.83358714,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"522","last_page":"529"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.9622465372085571},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7146887183189392},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6564251184463501},{"id":"https://openalex.org/keywords/lyrics","display_name":"Lyrics","score":0.5487575531005859},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.2164383828639984}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.9622465372085571},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7146887183189392},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6564251184463501},{"id":"https://openalex.org/C2776436406","wikidata":"https://www.wikidata.org/wiki/Q602446","display_name":"Lyrics","level":2,"score":0.5487575531005859},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.2164383828639984},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt48900.2021.9383585","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383585","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7400000095367432}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1579853615","https://openalex.org/W1731081199","https://openalex.org/W1882958252","https://openalex.org/W2102003408","https://openalex.org/W2402539796","https://openalex.org/W2515336442","https://openalex.org/W2794490148","https://openalex.org/W2795109282","https://openalex.org/W2889244839","https://openalex.org/W2921576841","https://openalex.org/W2937242376","https://openalex.org/W2963568710","https://openalex.org/W2963826681","https://openalex.org/W2963927338","https://openalex.org/W2964243274","https://openalex.org/W2971753973","https://openalex.org/W2981934523","https://openalex.org/W3019084079","https://openalex.org/W3036051869","https://openalex.org/W3095074555","https://openalex.org/W3097152652","https://openalex.org/W3133525064","https://openalex.org/W3142087749","https://openalex.org/W4295731579","https://openalex.org/W6634817459","https://openalex.org/W6637618735","https://openalex.org/W6639480849","https://openalex.org/W6750489868","https://openalex.org/W6761725420","https://openalex.org/W6767453231","https://openalex.org/W6776400185","https://openalex.org/W6779294058","https://openalex.org/W6782110587"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2360952181","https://openalex.org/W4310670065","https://openalex.org/W634160686","https://openalex.org/W2597614303","https://openalex.org/W3214861561","https://openalex.org/W2389838651","https://openalex.org/W2378183644","https://openalex.org/W2287414930"],"abstract_inverted_index":{"Singing":[0],"voice":[1,30,78,82,224],"synthesis":[2,14],"has":[3],"been":[4],"paid":[5],"rising":[6],"attention":[7],"with":[8],"the":[9,74,110,126,140,143,156,166,188,195,216],"rapid":[10],"development":[11],"of":[12,49,142,198,207,221],"speech":[13,92,231],"area.":[15],"In":[16,57,84,152],"general,":[17],"a":[18,27,38,54,69,87,100,148,169],"studio-level":[19],"singing":[20,29,70,77,81,89,106,162,196,208,223],"corpus":[21,39,90],"is":[22,40,121,219],"usually":[23],"necessary":[24],"to":[25,42,51,72,154,159,193],"produce":[26],"natural":[28],"from":[31,93,204],"lyrics":[32],"and":[33,91,107,114,172,209],"music-related":[34,123],"transcription.":[35],"However,":[36],"such":[37],"difficult":[41],"collect":[43],"since":[44,119],"it's":[45],"hard":[46],"for":[47,125,146,225],"many":[48],"us":[50],"sing":[52,160],"like":[53],"professional":[55],"singer.":[56],"this":[58],"paper,":[59],"we":[60,129,181],"propose":[61],"an":[62,136,173],"approach":[63,218],"-":[64],"Learn2Sing":[65],"that":[66,215],"only":[67,229],"needs":[68],"teacher":[71],"generate":[73],"target":[75,95,127,157,199,226],"speakers'":[76],"without":[79,161],"their":[80,230],"data.":[83,211],"our":[85],"approach,":[86],"teacher's":[88],"multiple":[94],"speakers":[96,200],"are":[97,177],"trained":[98],"in":[99,165,187],"frame-level":[101],"auto-regressive":[102],"acoustic":[103,144,189,205],"model":[104,145,171,176],"where":[105],"speaking":[108,210],"share":[109],"common":[111],"speaker":[112,158,227],"embedding":[113],"style":[115,203],"tag":[116],"embedding.":[117],"Meanwhile,":[118],"there":[120],"no":[122],"transcription":[124],"speaker,":[128],"use":[130],"log-scale":[131],"fundamental":[132],"frequency":[133],"(LF0)":[134],"as":[135,139],"auxiliary":[137],"feature":[138],"inputs":[141],"building":[147],"unified":[149],"input":[150],"representation.":[151],"order":[153],"enable":[155],"reference":[163],"audio":[164],"inference":[167],"stage,":[168],"duration":[170],"LF0":[174],"prediction":[175],"also":[178],"trained.":[179],"Particularly,":[180],"employ":[182],"domain":[183],"adversarial":[184],"training":[185],"(DAT)":[186],"model,":[190],"which":[191],"aims":[192],"enhance":[194],"performance":[197],"by":[201],"disentangling":[202],"features":[206],"Our":[212],"experiments":[213],"indicate":[214],"proposed":[217],"capable":[220],"synthesizing":[222],"given":[228],"samples.":[232]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
