{"id":"https://openalex.org/W3133525064","doi":"https://doi.org/10.1109/iscslp49672.2021.9362104","title":"ByteSing: A Chinese Singing Voice Synthesis System Using Duration Allocated Encoder-Decoder Acoustic Models and WaveRNN Vocoders","display_name":"ByteSing: A Chinese Singing Voice Synthesis System Using Duration Allocated Encoder-Decoder Acoustic Models and WaveRNN Vocoders","publication_year":2021,"publication_date":"2021-01-24","ids":{"openalex":"https://openalex.org/W3133525064","doi":"https://doi.org/10.1109/iscslp49672.2021.9362104","mag":"3133525064"},"language":"en","primary_location":{"id":"doi:10.1109/iscslp49672.2021.9362104","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp49672.2021.9362104","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100749025","display_name":"Yu Gu","orcid":"https://orcid.org/0000-0003-3634-2275"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu Gu","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069496916","display_name":"Xiang Yin","orcid":"https://orcid.org/0000-0002-6554-1516"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiang Yin","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059814193","display_name":"Yonghui Rao","orcid":"https://orcid.org/0009-0001-4530-9357"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yonghui Rao","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101137823","display_name":"Yuan Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan Wan","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031898944","display_name":"Benlai Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Benlai Tang","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100354733","display_name":"Yang Zhang","orcid":"https://orcid.org/0000-0002-8540-1254"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Zhang","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046063537","display_name":"Jitong Chen","orcid":"https://orcid.org/0000-0001-6084-043X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jitong Chen","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100375986","display_name":"Yuxuan Wang","orcid":"https://orcid.org/0000-0002-5743-2029"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuxuan Wang","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110851569","display_name":"Zejun Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zejun Ma","raw_affiliation_strings":["ByteDance AI Lab"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100749025"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":8.1786,"has_fulltext":false,"cited_by_count":57,"citation_normalized_percentile":{"value":0.98320284,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9914000034332275,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8336870670318604},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8220568895339966},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7200190424919128},{"id":"https://openalex.org/keywords/duration","display_name":"Duration (music)","score":0.6470118761062622},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6442725658416748},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5260417461395264},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.46970507502555847},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.4364163279533386},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.35182154178619385},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.17329749464988708},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.079263836145401}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8336870670318604},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8220568895339966},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7200190424919128},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.6470118761062622},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6442725658416748},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5260417461395264},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.46970507502555847},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.4364163279533386},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35182154178619385},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.17329749464988708},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.079263836145401},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iscslp49672.2021.9362104","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp49672.2021.9362104","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W29794711","https://openalex.org/W960890183","https://openalex.org/W1810943226","https://openalex.org/W2043003570","https://openalex.org/W2408435475","https://openalex.org/W2515336442","https://openalex.org/W2519091744","https://openalex.org/W2567070169","https://openalex.org/W2766812927","https://openalex.org/W2889244839","https://openalex.org/W2921576841","https://openalex.org/W2937242376","https://openalex.org/W2946200149","https://openalex.org/W2949382160","https://openalex.org/W2963300588","https://openalex.org/W2963609956","https://openalex.org/W2963970792","https://openalex.org/W2964243274","https://openalex.org/W2964307104","https://openalex.org/W2970730223","https://openalex.org/W2971753973","https://openalex.org/W2972910332","https://openalex.org/W2973046048","https://openalex.org/W2995670387","https://openalex.org/W3015499232","https://openalex.org/W3015922793","https://openalex.org/W3096437652","https://openalex.org/W4298580827","https://openalex.org/W6601203246","https://openalex.org/W6625166683","https://openalex.org/W6638273328","https://openalex.org/W6731370813","https://openalex.org/W6745697700","https://openalex.org/W6748409065","https://openalex.org/W6761725420","https://openalex.org/W6763832098","https://openalex.org/W6767453231","https://openalex.org/W6769685950","https://openalex.org/W6769754352","https://openalex.org/W6771539914","https://openalex.org/W6772134836"],"related_works":["https://openalex.org/W4313443006","https://openalex.org/W2945374968","https://openalex.org/W4293777179","https://openalex.org/W4385452045","https://openalex.org/W2164070813","https://openalex.org/W2135608140","https://openalex.org/W2895525995","https://openalex.org/W2332512904","https://openalex.org/W4224231624","https://openalex.org/W2319626700"],"abstract_inverted_index":{"This":[0],"paper":[1,114],"presents":[2],"ByteSing,":[3],"a":[4],"Chinese":[5],"singing":[6],"voice":[7,95],"synthesis":[8],"(SVS)":[9],"system":[10],"based":[11],"on":[12],"duration":[13,60],"allocated":[14],"Tacotron-like":[15,32],"acoustic":[16,37],"models":[17,43,133],"and":[18,44,53,79,102,120,127,131],"WaveRNN":[19,83],"neural":[20,46,89],"vocoders.":[21],"Different":[22],"from":[23],"the":[24,28,36,41,67,73,94,108,125,132],"conventional":[25],"SVS":[26,109],"models,":[27,38],"proposed":[29,111],"ByteSing":[30],"employs":[31],"encoder-decoder":[33],"structures":[34],"as":[35,51,88],"in":[39,112],"which":[40,70],"CBHG":[42],"recurrent":[45],"networks":[47],"(RNNs)":[48],"are":[49,85],"explored":[50],"encoders":[52],"decoders":[54],"respectively.":[55],"Meanwhile":[56],"an":[57],"auxiliary":[58],"phoneme":[59],"prediction":[61,81,129],"model":[62,74,77],"is":[63],"utilized":[64],"to":[65,91],"expand":[66],"input":[68],"sequence,":[69],"can":[71,115,137],"enhance":[72],"controllable":[75],"capacity,":[76],"stability":[78],"tempo":[80],"accuracy.":[82],"vocoders":[84,90],"also":[86],"adopted":[87],"further":[92],"improve":[93],"quality":[96],"of":[97],"synthesized":[98],"songs.":[99],"Both":[100],"objective":[101],"subjective":[103],"experimental":[104],"results":[105],"prove":[106],"that":[107],"method":[110],"this":[113],"produce":[116],"quite":[117],"natural,":[118],"expressive":[119],"high-fidelity":[121],"songs":[122],"by":[123],"improving":[124],"pitch":[126],"spectrogram":[128],"accuracy":[130],"using":[134],"attention":[135],"mechanism":[136],"achieve":[138],"best":[139],"performance.":[140]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":13},{"year":2023,"cited_by_count":12},{"year":2022,"cited_by_count":14},{"year":2021,"cited_by_count":14},{"year":2020,"cited_by_count":1}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
