{"id":"https://openalex.org/W4402389607","doi":"https://doi.org/10.1109/ialp63756.2024.10661192","title":"A Mel Spectrogram Enhancement Paradigm Based on CWT in Speech Synthesis","display_name":"A Mel Spectrogram Enhancement Paradigm Based on CWT in Speech Synthesis","publication_year":2024,"publication_date":"2024-08-04","ids":{"openalex":"https://openalex.org/W4402389607","doi":"https://doi.org/10.1109/ialp63756.2024.10661192"},"language":"en","primary_location":{"id":"doi:10.1109/ialp63756.2024.10661192","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ialp63756.2024.10661192","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Conference on Asian Language Processing (IALP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025777969","display_name":"Guoqiang Hu","orcid":"https://orcid.org/0000-0001-9333-1498"},"institutions":[{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guoqiang Hu","raw_affiliation_strings":["Jinan University,International School,Guangzhou,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Jinan University,International School,Guangzhou,China","institution_ids":["https://openalex.org/I159948400"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023849838","display_name":"Huaning Tan","orcid":"https://orcid.org/0000-0003-0999-2125"},"institutions":[{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huaning Tan","raw_affiliation_strings":["Jinan University,International School,Guangzhou,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Jinan University,International School,Guangzhou,China","institution_ids":["https://openalex.org/I159948400"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5111234378","display_name":"Ruilai Li","orcid":null},"institutions":[{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruilai Li","raw_affiliation_strings":["Jinan University,International School,Guangzhou,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Jinan University,International School,Guangzhou,China","institution_ids":["https://openalex.org/I159948400"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I159948400"],"apc_list":null,"apc_paid":null,"fwci":1.4974,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.82902542,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"401","last_page":"405"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.9368535280227661},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.686412513256073},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6334314942359924},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.5645434856414795},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3279265761375427},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.05233725905418396}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.9368535280227661},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.686412513256073},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6334314942359924},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.5645434856414795},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3279265761375427},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.05233725905418396}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ialp63756.2024.10661192","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ialp63756.2024.10661192","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Conference on Asian Language Processing (IALP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1975163393","https://openalex.org/W2066452495","https://openalex.org/W2067530554","https://openalex.org/W2100921845","https://openalex.org/W2120847449","https://openalex.org/W2145892079","https://openalex.org/W2152328854","https://openalex.org/W2395380967","https://openalex.org/W2919115771","https://openalex.org/W2950673314","https://openalex.org/W2970675500","https://openalex.org/W2970737019","https://openalex.org/W2997430147","https://openalex.org/W3030286867","https://openalex.org/W3141797743","https://openalex.org/W4307323391","https://openalex.org/W4313679638","https://openalex.org/W4367840320","https://openalex.org/W4378473976","https://openalex.org/W4392903159","https://openalex.org/W6754005058","https://openalex.org/W6772357204","https://openalex.org/W6848735303","https://openalex.org/W6852500804","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2011227383","https://openalex.org/W2897924318","https://openalex.org/W2138997758","https://openalex.org/W3096184950","https://openalex.org/W4231424160","https://openalex.org/W2275432853"],"abstract_inverted_index":{"Acoustic":[0],"features":[1],"play":[2],"an":[3,79,143],"important":[4],"role":[5],"in":[6,24,50,110,175],"improving":[7],"the":[8,11,15,31,40,71,89,95,100,127,131,134,151,161,164,170,173],"quality":[9],"of":[10,42,145,163,172],"synthesised":[12,44,129],"speech.":[13],"Currently,":[14],"Mel":[16,46,60,65,96,135],"spectrogram":[17,47,66,97,136],"is":[18,48],"a":[19,57,64,82],"widely":[20],"employed":[21],"acoustic":[22,26],"feature":[23],"most":[25],"models.":[27],"However,":[28],"due":[29],"to":[30,55,112,150],"fine-grained":[32],"loss":[33],"caused":[34],"by":[35,45,99],"its":[36],"Fourier":[37],"transform":[38,74],"process,":[39],"clarity":[41],"speech":[43,119,128],"compromised":[49],"mutant":[51],"signals.":[52],"In":[53],"order":[54,111],"obtain":[56],"more":[58,83],"detailed":[59,84],"spectrogram,":[61,86],"we":[62],"propose":[63],"enhancement":[67,137,165],"paradigm":[68,77,138,174],"based":[69],"on":[70],"continuous":[72],"wavelet":[73,85],"(CWT).":[75],"This":[76],"introduces":[78],"additional":[80],"task:":[81],"which":[87],"like":[88],"post-processing":[90],"network":[91],"takes":[92],"as":[93,167],"input":[94],"output":[98],"decoder.":[101],"We":[102],"choose":[103],"Tacotron2":[104],"and":[105,116,147],"Fastspeech2":[106],"for":[107,160],"experimental":[108,123],"validation":[109,159],"test":[113],"autoregressive":[114],"(AR)":[115],"non-autoregressive":[117],"(NAR)":[118],"systems,":[120],"respectively.":[121,154],"The":[122],"results":[124],"demonstrate":[125,169],"that":[126],"using":[130],"model":[132],"with":[133,142],"exhibits":[139],"higher":[140],"MOS,":[141],"improvement":[144],"0.14":[146],"0.09":[148],"compared":[149],"baseline":[152],"model,":[153],"These":[155],"findings":[156],"provide":[157],"some":[158],"universality":[162],"paradigm,":[166],"they":[168],"success":[171],"different":[176],"architectures.":[177]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-10-10T00:00:00"}
