{"id":"https://openalex.org/W4319586988","doi":"https://doi.org/10.1109/iscslp57327.2022.10037822","title":"Rhythm-controllable Attention with High Robustness for Long Sentence Speech Synthesis","display_name":"Rhythm-controllable Attention with High Robustness for Long Sentence Speech Synthesis","publication_year":2022,"publication_date":"2022-12-11","ids":{"openalex":"https://openalex.org/W4319586988","doi":"https://doi.org/10.1109/iscslp57327.2022.10037822"},"language":"en","primary_location":{"id":"doi:10.1109/iscslp57327.2022.10037822","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/iscslp57327.2022.10037822","pdf_url":null,"source":{"id":"https://openalex.org/S4363607181","display_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2306.02593","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102845007","display_name":"Dengfeng Ke","orcid":"https://orcid.org/0000-0001-8459-0412"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dengfeng Ke","raw_affiliation_strings":["Beijing Language and Culture University,Beijing,China","Beijing Language and Culture University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University,Beijing,China","institution_ids":["https://openalex.org/I115212828"]},{"raw_affiliation_string":"Beijing Language and Culture University, Beijing, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008597430","display_name":"Yayue Deng","orcid":"https://orcid.org/0009-0003-7642-4942"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]},{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yayue Deng","raw_affiliation_strings":["Beijing Language and Culture University,Beijing,China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","Beijing Language and Culture University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University,Beijing,China","institution_ids":["https://openalex.org/I115212828"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"Beijing Language and Culture University, Beijing, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002689662","display_name":"Yukang Jia","orcid":null},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yukang Jia","raw_affiliation_strings":["Beijing Language and Culture University,Beijing,China","Beijing Language and Culture University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University,Beijing,China","institution_ids":["https://openalex.org/I115212828"]},{"raw_affiliation_string":"Beijing Language and Culture University, Beijing, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033565985","display_name":"Jinlong Xue","orcid":"https://orcid.org/0009-0000-0442-0932"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinlong Xue","raw_affiliation_strings":["Beijing University of Posts and Telecommunications,School of Artificial Intelligence,Beijing,China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications,School of Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057554364","display_name":"Qi Luo","orcid":"https://orcid.org/0009-0000-8290-5441"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qi Luo","raw_affiliation_strings":["Beijing Language and Culture University,Beijing,China","Beijing Language and Culture University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University,Beijing,China","institution_ids":["https://openalex.org/I115212828"]},{"raw_affiliation_string":"Beijing Language and Culture University, Beijing, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100343662","display_name":"Ya Li","orcid":"https://orcid.org/0000-0002-6284-5039"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ya Li","raw_affiliation_strings":["Beijing University of Posts and Telecommunications,School of Artificial Intelligence,Beijing,China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications,School of Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047888605","display_name":"Jianqing Sun","orcid":"https://orcid.org/0009-0007-3598-8564"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jianqing Sun","raw_affiliation_strings":["Unisound AI Technology Co., Ltd,Beijing,China","Unisound AI Technology Co., Ltd, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Unisound AI Technology Co., Ltd,Beijing,China","institution_ids":[]},{"raw_affiliation_string":"Unisound AI Technology Co., Ltd, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110832899","display_name":"Jiaen Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiaen Liang","raw_affiliation_strings":["Unisound AI Technology Co., Ltd,Beijing,China","Unisound AI Technology Co., Ltd, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Unisound AI Technology Co., Ltd,Beijing,China","institution_ids":[]},{"raw_affiliation_string":"Unisound AI Technology Co., Ltd, Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101084840","display_name":"Binghuai Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Binghuai Lin","raw_affiliation_strings":["Tencent Technology Co., Ltd,Smart Platform Product Department,China","Smart Platform Product Department, Tencent Technology Co., Ltd, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent Technology Co., Ltd,Smart Platform Product Department,China","institution_ids":["https://openalex.org/I2250653659"]},{"raw_affiliation_string":"Smart Platform Product Department, Tencent Technology Co., Ltd, China","institution_ids":["https://openalex.org/I2250653659"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.1038,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.36231116,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"33","issue":null,"first_page":"220","last_page":"224"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.8740063905715942},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.8633021712303162},{"id":"https://openalex.org/keywords/rhythm","display_name":"Rhythm","score":0.8222982883453369},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6686582565307617},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6508129835128784},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.582534670829773},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4902541935443878},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.325153648853302},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.07787764072418213}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.8740063905715942},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.8633021712303162},{"id":"https://openalex.org/C135343436","wikidata":"https://www.wikidata.org/wiki/Q170406","display_name":"Rhythm","level":2,"score":0.8222982883453369},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6686582565307617},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6508129835128784},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.582534670829773},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4902541935443878},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.325153648853302},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.07787764072418213},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/iscslp57327.2022.10037822","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/iscslp57327.2022.10037822","pdf_url":null,"source":{"id":"https://openalex.org/S4363607181","display_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2306.02593","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.02593","pdf_url":"https://arxiv.org/pdf/2306.02593","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2306.02593","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.02593","pdf_url":"https://arxiv.org/pdf/2306.02593","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.4099999964237213,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[{"id":"https://openalex.org/G6685334198","display_name":null,"funder_award_id":"202200042","funder_id":"https://openalex.org/F4320326873","funder_display_name":"National Laboratory of Pattern Recognition"},{"id":"https://openalex.org/G7257301138","display_name":null,"funder_award_id":"2021RC37","funder_id":"https://openalex.org/F4320321470","funder_display_name":"Beijing University of Posts and Telecommunications"}],"funders":[{"id":"https://openalex.org/F4320321470","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59"},{"id":"https://openalex.org/F4320326873","display_name":"National Laboratory of Pattern Recognition","ror":null}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4319586988.pdf"},"referenced_works_count":37,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1522301498","https://openalex.org/W1810943226","https://openalex.org/W2133564696","https://openalex.org/W2519091744","https://openalex.org/W2605141709","https://openalex.org/W2747874407","https://openalex.org/W2770743791","https://openalex.org/W2794490148","https://openalex.org/W2886769154","https://openalex.org/W2915977493","https://openalex.org/W2947723875","https://openalex.org/W2964243274","https://openalex.org/W2970006822","https://openalex.org/W2972702018","https://openalex.org/W3015282541","https://openalex.org/W3026874504","https://openalex.org/W3033411150","https://openalex.org/W3095545636","https://openalex.org/W3128910262","https://openalex.org/W3195077602","https://openalex.org/W3198791321","https://openalex.org/W3207354624","https://openalex.org/W4286693788","https://openalex.org/W4287241477","https://openalex.org/W4294619417","https://openalex.org/W6623517193","https://openalex.org/W6631190155","https://openalex.org/W6679434410","https://openalex.org/W6735706088","https://openalex.org/W6746238782","https://openalex.org/W6747158283","https://openalex.org/W6750489868","https://openalex.org/W6767111847","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6790220310"],"related_works":["https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W2108985546","https://openalex.org/W2081919107","https://openalex.org/W2433276473","https://openalex.org/W1537411440","https://openalex.org/W1984347656","https://openalex.org/W2535215250","https://openalex.org/W2024201202","https://openalex.org/W2049083033"],"abstract_inverted_index":{"Regressive":[0],"Text-to-Speech":[1],"(TTS)":[2],"system":[3],"utilizes":[4],"attention":[5,35,73],"mechanism":[6],"to":[7,48,84,115,151,157],"generate":[8,49],"alignment":[9],"between":[10],"text":[11],"and":[12,26,28,69,94,111,121],"acoustic":[13],"feature":[14],"sequence.":[15],"Alignment":[16],"determines":[17],"synthesis":[18,40],"robustness":[19,68,117],"(e.g,":[20],"the":[21,132,152,163],"occurence":[22],"of":[23,82,118,123,137],"skipping,":[24],"repeating,":[25],"collapse)":[27],"rhythm":[29,43,86,90],"via":[30],"duration":[31,46],"control.":[32],"However,":[33],"current":[34],"algorithms":[36],"used":[37],"in":[38],"speech":[39,51,153],"cannot":[41],"control":[42,91],"using":[44],"external":[45],"information":[47,83],"natural":[50,167],"while":[52],"ensuring":[53],"robustness.":[54],"In":[55],"this":[56],"study,":[57],"we":[58],"propose":[59],"Rhythm-controllable":[60],"Attention":[61],"(RC-Attention)":[62],"based":[63],"on":[64],"Tracotron2,":[65],"which":[66,88],"improves":[67],"naturalness":[70,122],"simultaneously.":[71],"Proposed":[72],"adopts":[74],"a":[75],"trainable":[76],"scalar":[77],"learned":[78],"from":[79],"four":[80],"kinds":[81],"achieve":[85],"control,":[87],"makes":[89],"more":[92,166],"robust":[93],"natural,":[95],"even":[96],"when":[97],"synthesized":[98,124,154],"sentences":[99],"are":[100],"extremely":[101],"longer":[102],"than":[103],"training":[104],"corpus.":[105],"We":[106],"use":[107],"word":[108,134],"errors":[109],"counting":[110],"AB":[112],"preference":[113],"test":[114],"measure":[116],"proposed":[119],"method":[120],"speech,":[125],"respectively.":[126],"Results":[127],"shows":[128],"that":[129,158],"RC-Attention":[130,156],"has":[131,165],"lowest":[133],"error":[135],"rate":[136],"nearly":[138,147],"0.6%,":[139],"compared":[140],"with":[141,155,159],"11.8%":[142],"for":[143],"baseline":[144],"system.":[145],"Moreover,":[146],"60%":[148],"subjects":[149],"prefer":[150],"Forward":[160],"Attention,":[161],"because":[162],"former":[164],"rhythm.":[168]},"counts_by_year":[{"year":2022,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
