{"id":"https://openalex.org/W3186829543","doi":"https://doi.org/10.1145/3459212.3459222","title":"A Transformer-based Chinese Non-autoregressive Speech Synthesis Scheme","display_name":"A Transformer-based Chinese Non-autoregressive Speech Synthesis Scheme","publication_year":2021,"publication_date":"2021-03-19","ids":{"openalex":"https://openalex.org/W3186829543","doi":"https://doi.org/10.1145/3459212.3459222","mag":"3186829543"},"language":"en","primary_location":{"id":"doi:10.1145/3459212.3459222","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3459212.3459222","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 3rd International Conference on Image, Video and Signal Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012144978","display_name":"Yueqing Cai","orcid":null},"institutions":[{"id":"https://openalex.org/I196699116","display_name":"Wuhan University of Technology","ror":"https://ror.org/03fe7t173","country_code":"CN","type":"education","lineage":["https://openalex.org/I196699116"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yueqing Cai","raw_affiliation_strings":["Wuhan University of Technology, China"],"affiliations":[{"raw_affiliation_string":"Wuhan University of Technology, China","institution_ids":["https://openalex.org/I196699116"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076793177","display_name":"Wenbi Rao","orcid":"https://orcid.org/0000-0003-1465-1544"},"institutions":[{"id":"https://openalex.org/I196699116","display_name":"Wuhan University of Technology","ror":"https://ror.org/03fe7t173","country_code":"CN","type":"education","lineage":["https://openalex.org/I196699116"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenbi Rao","raw_affiliation_strings":["Wuhan University of Technology, China"],"affiliations":[{"raw_affiliation_string":"Wuhan University of Technology, China","institution_ids":["https://openalex.org/I196699116"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5012144978"],"corresponding_institution_ids":["https://openalex.org/I196699116"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10124245,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"59","last_page":"64"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9902999997138977,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7685675024986267},{"id":"https://openalex.org/keywords/discriminator","display_name":"Discriminator","score":0.7451057434082031},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.7401389479637146},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.7272895574569702},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.6745152473449707},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6743552684783936},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.6255437731742859},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5679176449775696},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.5249649882316589},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.5116817951202393},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39190787076950073},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09217393398284912}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7685675024986267},{"id":"https://openalex.org/C2779803651","wikidata":"https://www.wikidata.org/wiki/Q5282088","display_name":"Discriminator","level":3,"score":0.7451057434082031},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.7401389479637146},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.7272895574569702},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.6745152473449707},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6743552684783936},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.6255437731742859},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5679176449775696},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.5249649882316589},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.5116817951202393},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39190787076950073},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09217393398284912},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3459212.3459222","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3459212.3459222","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 3rd International Conference on Image, Video and Signal Processing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.7400000095367432,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W2747874407","https://openalex.org/W2903739847","https://openalex.org/W2963403868","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2970730223","https://openalex.org/W3015338123"],"related_works":["https://openalex.org/W4391272374","https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W40885451","https://openalex.org/W2108985546","https://openalex.org/W2081919107","https://openalex.org/W2433276473","https://openalex.org/W1537411440","https://openalex.org/W1984347656","https://openalex.org/W2535215250"],"abstract_inverted_index":{"At":[0],"present,":[1],"the":[2,7,27,36,39,57,68,83,149],"main":[3],"research":[4],"hotspot":[5],"in":[6],"field":[8],"of":[9,31,38,60,131,142],"speech":[10,15,23,96],"synthesis":[11,24,97],"is":[12,145],"still":[13],"English":[14],"synthesis,":[16],"and":[17,46,52,70,107],"there":[18,47],"are":[19],"few":[20],"non-autoregressive":[21],"Chinese":[22,28],"models.":[25],"During":[26],"migration":[29],"process":[30],"FastSpeech2,":[32],"we":[33,64,99,111],"found":[34],"that":[35,120,130],"naturalness":[37],"synthesized":[40],"audio":[41,79],"was":[42],"not":[43],"good":[44],"enough":[45],"were":[48],"some":[49],"abnormal":[50],"interruptions":[51],"incorrect":[53],"pronunciation.":[54],"Inspired":[55],"by":[56],"training":[58],"method":[59],"generative":[61],"adversarial":[62],"network,":[63],"use":[65],"FastSpeech2":[66,76],"as":[67],"generator,":[69],"add":[71],"a":[72,90,101,124,138],"discriminator":[73],"to":[74,77,82,88,93],"force":[75],"generate":[78],"more":[80],"similar":[81],"real":[84],"audio.":[85],"In":[86],"order":[87],"realize":[89],"complete":[91],"text":[92],"Mel":[94,126],"spectrogram":[95],"scheme,":[98],"design":[100],"text-to-phoneme":[102],"converter":[103],"based":[104],"on":[105,114],"corpus":[106],"rule":[108],"constraints.":[109],"And":[110,133],"conduct":[112],"experiments":[113],"Baker":[115],"dataset.":[116],"The":[117],"results":[118],"show":[119],"our":[121,134],"model":[122,135],"achieves":[123],"better":[125,147],"Cepstral":[127],"Distance":[128],"than":[129,148],"FastSpeech2.":[132],"can":[136],"achieve":[137],"mean":[139],"opinion":[140],"score":[141],"3.94,":[143],"which":[144],"slightly":[146],"original":[150],"model.":[151]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
