{"id":"https://openalex.org/W4392904355","doi":"https://doi.org/10.1109/icassp48485.2024.10446750","title":"An Experimental Comparison of Noise-Robust Text-To-Speech Synthesis Systems Based On Self-Supervised Representation","display_name":"An Experimental Comparison of Noise-Robust Text-To-Speech Synthesis Systems Based On Self-Supervised Representation","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392904355","doi":"https://doi.org/10.1109/icassp48485.2024.10446750"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446750","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446750","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052168680","display_name":"Xiaoying Zhao","orcid":"https://orcid.org/0000-0001-7071-3901"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiaoying Zhao","raw_affiliation_strings":["University of Science and Technology of China (USTC),Hefei,China","University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045966396","display_name":"Qiushi Zhu","orcid":"https://orcid.org/0000-0002-1196-7781"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiushi Zhu","raw_affiliation_strings":["University of Science and Technology of China (USTC),Hefei,China","University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074544822","display_name":"Yu\u2010Chen Hu","orcid":"https://orcid.org/0000-0002-5055-3645"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yuchen Hu","raw_affiliation_strings":["Nanyang Technological University,Singapore","Nanyang Technological University, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University,Singapore","institution_ids":["https://openalex.org/I172675005"]},{"raw_affiliation_string":"Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5052168680"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.7252,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.73058304,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"11441","last_page":"11445"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7905575037002563},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7701252698898315},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.713108241558075},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.598323404788971},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5835475325584412},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.498063325881958},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4817066788673401},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4815438389778137},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.4145240783691406},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3796638250350952},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.07706382870674133}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7905575037002563},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7701252698898315},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.713108241558075},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.598323404788971},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5835475325584412},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.498063325881958},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4817066788673401},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4815438389778137},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.4145240783691406},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3796638250350952},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.07706382870674133},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446750","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446750","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W2020024436","https://openalex.org/W2033875152","https://openalex.org/W2066452495","https://openalex.org/W2514828952","https://openalex.org/W2603567530","https://openalex.org/W2801554275","https://openalex.org/W2907262790","https://openalex.org/W2972359262","https://openalex.org/W3015213852","https://openalex.org/W3029282897","https://openalex.org/W3037038648","https://openalex.org/W3133525064","https://openalex.org/W3163906773","https://openalex.org/W3205533980","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4221140371","https://openalex.org/W4221146604","https://openalex.org/W4226380987","https://openalex.org/W4226390724","https://openalex.org/W4376456759","https://openalex.org/W4385822534","https://openalex.org/W4386273222","https://openalex.org/W6755300632","https://openalex.org/W6763832098","https://openalex.org/W6778215197","https://openalex.org/W6778823374","https://openalex.org/W6780218876","https://openalex.org/W6783867762","https://openalex.org/W6810007534","https://openalex.org/W6855932338"],"related_works":["https://openalex.org/W2081900870","https://openalex.org/W2560215812","https://openalex.org/W2949601986","https://openalex.org/W2788972299","https://openalex.org/W2498789492","https://openalex.org/W2521347458","https://openalex.org/W2037549926","https://openalex.org/W2729981612","https://openalex.org/W2345479200","https://openalex.org/W3134175397"],"abstract_inverted_index":{"With":[0],"the":[1,17,36,47,50,66,87,94,129,137,153,159,162],"advance":[2],"in":[3,20,61],"deep":[4],"learning,":[5],"text-to-speech":[6],"(TTS)":[7],"using":[8,117],"clean":[9],"speech":[10,38,84],"has":[11],"witnessed":[12],"significant":[13],"performance":[14,95],"improvements.":[15],"As":[16],"data":[18],"collected":[19],"real":[21],"scenes":[22],"often":[23],"contain":[24],"noise":[25,59,73,138],"and":[26,42,122,139,155],"thus":[27],"needs":[28],"to":[29,72,135],"be":[30],"denoised,":[31],"TTS":[32,116],"models":[33,56],"trained":[34],"on":[35,93,152],"enhanced":[37],"suffer":[39],"from":[40],"distortions":[41],"residual":[43],"noises,":[44],"which":[45],"affect":[46],"quality":[48],"of":[49,89,107,161],"synthesized":[51],"speech.":[52],"Meanwhile,":[53],"self-supervised":[54,91,108],"pre-trained":[55],"exhibit":[57],"good":[58],"robustness":[60],"many":[62],"tasks,":[63],"indicating":[64],"that":[65,78],"learned":[67],"representation":[68],"is":[69],"more":[70],"tolerant":[71],"perturbations.":[74],"It":[75],"was":[76],"shown":[77],"WavLM-based":[79],"representations":[80,92,109,130],"are":[81,168],"noise-robust":[82],"for":[83,115,147],"synthesis,":[85],"but":[86],"impact":[88],"different":[90],"remains":[96],"unknown.":[97],"In":[98],"this":[99],"paper,":[100],"we":[101,142],"therefore":[102],"experimentally":[103],"compare":[104],"four":[105],"types":[106],"(e.g.,":[110],"WavLM,":[111],"Wav2vec2.0,":[112],"HuBERT,":[113],"Data2vec)":[114],"a":[118,123,132],"HiFi-GAN-based":[119],"representation-to-waveform":[120],"vocoder":[121],"Fastspeech-based":[124],"text-to-representation":[125],"acoustic":[126],"model.":[127],"Since":[128],"have":[131],"better":[133],"capacity":[134],"suppress":[136],"speaker":[140,145],"clues,":[141],"further":[143],"integrate":[144],"embedding":[146],"voice":[148],"conversion.":[149],"Experimental":[150],"results":[151],"LJSpeech":[154],"LibriTTS":[156],"datasets":[157],"demonstrate":[158],"efficacy":[160],"proposed":[163],"method.":[164],"Some":[165],"audio":[166],"samples":[167],"available":[169],"at:":[170],"https://zxyzqs.github.io/.":[171]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
