{"id":"https://openalex.org/W4392931282","doi":"https://doi.org/10.1109/icassp48485.2024.10448495","title":"High-Fidelity Speech Synthesis with Minimal Supervision: All Using Diffusion Models","display_name":"High-Fidelity Speech Synthesis with Minimal Supervision: All Using Diffusion Models","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392931282","doi":"https://doi.org/10.1109/icassp48485.2024.10448495"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10448495","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448495","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028353824","display_name":"Chunyu Qiang","orcid":"https://orcid.org/0009-0007-2290-3074"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]},{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chunyu Qiang","raw_affiliation_strings":["Tianjin University,School of New Media and Communication,Tianjin,China","School of New Media and Communication, Tianjin University, Tianjin, China","Kuaishou Technology Co., Ltd, Beijing, China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,School of New Media and Communication,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100348721","display_name":"Hao Li","orcid":"https://orcid.org/0009-0005-4319-9026"},"institutions":[{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]},{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Li","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100528371","display_name":"Yixin Tian","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]},{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yixin Tian","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081082934","display_name":"Yi Zhao","orcid":"https://orcid.org/0000-0003-4864-895X"},"institutions":[{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]},{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Zhao","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100386277","display_name":"Ying Zhang","orcid":"https://orcid.org/0009-0000-9627-8768"},"institutions":[{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]},{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Zhang","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101745213","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-8094-6861"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin University,School of New Media and Communication,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","School of New Media and Communication, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,School of New Media and Communication,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5028353824"],"corresponding_institution_ids":["https://openalex.org/I162868743","https://openalex.org/I4401726859"],"apc_list":null,"apc_paid":null,"fwci":1.3264,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.82475732,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"10781","last_page":"10785"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7177659273147583},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5925261378288269},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5866662263870239},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.5429655313491821},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.48913314938545227},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.4676365852355957},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.4607992470264435},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4275129437446594},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.32454994320869446},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.12217533588409424}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7177659273147583},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5925261378288269},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5866662263870239},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5429655313491821},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.48913314938545227},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.4676365852355957},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.4607992470264435},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4275129437446594},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32454994320869446},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.12217533588409424},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10448495","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448495","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2519091744","https://openalex.org/W2752782242","https://openalex.org/W2903739847","https://openalex.org/W2946200149","https://openalex.org/W2963609956","https://openalex.org/W3026874504","https://openalex.org/W3036167779","https://openalex.org/W3036601975","https://openalex.org/W3161296985","https://openalex.org/W3209059054","https://openalex.org/W3215615641","https://openalex.org/W4226033575","https://openalex.org/W4254489317","https://openalex.org/W4296068981","https://openalex.org/W4307323391","https://openalex.org/W4312069022","https://openalex.org/W4313679638","https://openalex.org/W4319779739","https://openalex.org/W4323651091","https://openalex.org/W4366460484","https://openalex.org/W4372260402","https://openalex.org/W4381786045","https://openalex.org/W4382603054","https://openalex.org/W4390075359","https://openalex.org/W4392903524","https://openalex.org/W4392909624","https://openalex.org/W4402301063","https://openalex.org/W6631190155","https://openalex.org/W6734815144","https://openalex.org/W6763832098","https://openalex.org/W6777694618","https://openalex.org/W6779823529","https://openalex.org/W6780218876","https://openalex.org/W6848735303","https://openalex.org/W6850334629","https://openalex.org/W6851724922","https://openalex.org/W6853888607","https://openalex.org/W6860417749"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2011227383","https://openalex.org/W2088854863","https://openalex.org/W4402568167","https://openalex.org/W3179495260","https://openalex.org/W1976719989","https://openalex.org/W3127543252","https://openalex.org/W2065606036","https://openalex.org/W2016904525"],"abstract_inverted_index":{"Text-to-speech":[0],"(TTS)":[1],"methods":[2,48],"have":[3],"shown":[4],"promising":[5],"results":[6,178],"in":[7,56,63,144],"voice":[8],"cloning,":[9],"but":[10],"they":[11],"require":[12],"a":[13,93],"large":[14],"number":[15],"of":[16,29,138,172],"labeled":[17],"text-speech":[18],"pairs.":[19],"Minimally-supervised":[20],"speech":[21,31,96],"synthesis":[22,97],"decouples":[23],"TTS":[24],"by":[25,83,163],"combining":[26],"two":[27,37],"types":[28],"discrete":[30,64],"representations(semantic":[32],"&":[33],"acoustic)":[34],"and":[35,53,59,72,114,141,158],"using":[36],"sequence-to-sequence":[38],"tasks":[39,167],"to":[40,134,168],"enable":[41],"training":[42],"with":[43],"minimal":[44],"supervision.":[45],"However,":[46],"existing":[47,145],"suffer":[49,78],"from":[50,79],"information":[51,139],"redundancy":[52,140],"dimension":[54,142],"explosion":[55,143],"semantic":[57,132,146,157],"representation,":[58],"high-frequency":[60,173],"waveform":[61,175],"distortion":[62],"acoustic":[65,154,159],"representation.":[66,155],"Autoregressive":[67],"frameworks":[68,77],"exhibit":[69],"typical":[70],"instability":[71],"uncontrollability":[73],"issues.":[74],"And":[75],"non-autoregressive":[76,110],"prosodic":[80,121],"averaging":[81],"caused":[82],"duration":[84,116],"prediction":[85],"models.":[86,108],"To":[87],"address":[88],"these":[89],"issues,":[90],"we":[91],"propose":[92],"minimally-supervised":[94],"high-fidelity":[95],"method,":[98],"where":[99],"all":[100],"modules":[101],"are":[102,161],"constructed":[103],"based":[104],"on":[105,192],"the":[106,115,136,153,170,185],"diffusion":[107,117],"The":[109],"framework":[111],"enhances":[112],"controllability,":[113],"model":[118],"enables":[119],"diversified":[120],"expression.":[122],"Contrastive":[123],"Token-Acoustic":[124],"Pretraining":[125],"(CTAP)":[126],"is":[127,150],"used":[128,151],"as":[129,152],"an":[130],"intermediate":[131],"representation":[133],"solve":[135,169],"problems":[137],"coding":[147],"methods.":[148],"Mel-spectrogram":[149],"Both":[156],"representations":[160],"predicted":[162],"continuous":[164],"variable":[165],"regression":[166],"problem":[171],"fine-grained":[174],"distortion.":[176],"Experimental":[177],"show":[179],"that":[180],"our":[181,193],"proposed":[182],"method":[183],"outperforms":[184],"baseline":[186],"method.":[187],"We":[188],"provide":[189],"audio":[190],"samples":[191],"website.":[194],"<sup":[195],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[196],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[197]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
