{"id":"https://openalex.org/W4392903524","doi":"https://doi.org/10.1109/icassp48485.2024.10446203","title":"Minimally-Supervised Speech Synthesis with Conditional Diffusion Model and Language Model: A Comparative Study of Semantic Coding","display_name":"Minimally-Supervised Speech Synthesis with Conditional Diffusion Model and Language Model: A Comparative Study of Semantic Coding","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903524","doi":"https://doi.org/10.1109/icassp48485.2024.10446203"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446203","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446203","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028353824","display_name":"Chunyu Qiang","orcid":"https://orcid.org/0009-0007-2290-3074"},"institutions":[{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]},{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chunyu Qiang","raw_affiliation_strings":["Tianjin University,School of New Media and Communication,Tianjin,China","Kuaishou Technology Co., Ltd, Beijing, China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","School of New Media and Communication, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,School of New Media and Communication,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100348721","display_name":"Hao Li","orcid":"https://orcid.org/0009-0005-4319-9026"},"institutions":[{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]},{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Li","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029516423","display_name":"Hao Ni","orcid":"https://orcid.org/0000-0001-5485-4376"},"institutions":[{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]},{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Ni","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101319176","display_name":"He Qu","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]},{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"He Qu","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073918837","display_name":"Ruibo Fu","orcid":"https://orcid.org/0000-0001-9598-1881"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruibo Fu","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation,Beijing,China","Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453420","display_name":"Tao Wang","orcid":"https://orcid.org/0000-0001-8099-9704"},"institutions":[{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Wang","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation,Beijing,China","Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101745213","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-8094-6861"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin University,School of New Media and Communication,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","School of New Media and Communication, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,School of New Media and Communication,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5028353824"],"corresponding_institution_ids":["https://openalex.org/I162868743","https://openalex.org/I4401726859"],"apc_list":null,"apc_paid":null,"fwci":4.3108,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.94698438,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"10186","last_page":"10190"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7518690824508667},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.6123467683792114},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6047452092170715},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5521537065505981},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4766681492328644},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4761892855167389},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.4754880666732788},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4659247100353241},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.44602739810943604},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.19453495740890503},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.12699934840202332}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7518690824508667},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.6123467683792114},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6047452092170715},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5521537065505981},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4766681492328644},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4761892855167389},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.4754880666732788},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4659247100353241},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.44602739810943604},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.19453495740890503},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.12699934840202332},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446203","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446203","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.550000011920929}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2519091744","https://openalex.org/W2591927543","https://openalex.org/W2752782242","https://openalex.org/W2903739847","https://openalex.org/W2946200149","https://openalex.org/W2963609956","https://openalex.org/W3026874504","https://openalex.org/W3036167779","https://openalex.org/W3036601975","https://openalex.org/W3094002217","https://openalex.org/W3161296985","https://openalex.org/W3209059054","https://openalex.org/W3215615641","https://openalex.org/W4292779060","https://openalex.org/W4296068981","https://openalex.org/W4307323391","https://openalex.org/W4312069022","https://openalex.org/W4313679638","https://openalex.org/W4323651091","https://openalex.org/W4366460484","https://openalex.org/W4372260402","https://openalex.org/W4381786045","https://openalex.org/W4382603054","https://openalex.org/W4390075359","https://openalex.org/W4402301063","https://openalex.org/W6763832098","https://openalex.org/W6777694618","https://openalex.org/W6778883912","https://openalex.org/W6779823529","https://openalex.org/W6780218876","https://openalex.org/W6847363464"],"related_works":["https://openalex.org/W3013693939","https://openalex.org/W2159052453","https://openalex.org/W2566616303","https://openalex.org/W3131327266","https://openalex.org/W2734887215","https://openalex.org/W4297051394","https://openalex.org/W2752972570","https://openalex.org/W4386815338","https://openalex.org/W2053531689","https://openalex.org/W1842536210"],"abstract_inverted_index":{"Recently,":[0],"there":[1],"has":[2],"been":[3],"a":[4,97,113,124,130,134,146,157,170,205],"growing":[5],"interest":[6],"in":[7,59,64],"text-to-speech":[8],"(TTS)":[9],"methods":[10,36,85,199],"that":[11,155,179,196],"can":[12],"be":[13],"trained":[14],"with":[15,207],"minimal":[16],"supervision":[17],"by":[18,54],"combining":[19],"two":[20,28],"types":[21],"of":[22,45,74,96,150,174,183],"discrete":[23,46],"speech":[24,47],"representations":[25],"and":[26,62,71,100,133,188],"using":[27],"sequence-to-sequence":[29],"tasks":[30],"to":[31,67,116,137,161],"decouple":[32],"TTS.":[33],"However,":[34],"existing":[35,75,184],"suffer":[37],"from":[38],"three":[39,83,175],"problems:":[40],"the":[41,49,55,68,105,109,181,190],"high-frequency":[42],"waveform":[43],"distortion":[44],"representations,":[48],"prosodic":[50,164],"averaging":[51],"problem":[52],"caused":[53],"duration":[56,158],"prediction":[57,65],"model":[58,99,115,160],"non-autoregressive":[60,147,171],"frameworks,":[61],"difficulty":[63],"due":[66],"information":[69],"redundancy":[70],"dimension":[72],"explosion":[73],"semantic":[76,106,185],"coding":[77,186],"methods.":[78,202],"To":[79],"address":[80],"these":[81],"problems,":[82],"progressive":[84],"are":[86],"proposed.":[87],"First,":[88],"we":[89,143,167],"propose":[90,144,168],"Diff-LM-Speech,":[91],"an":[92],"autoregressive":[93],"structure":[94,127,148,172],"consisting":[95,149,173],"language":[98],"diffusion":[101,114,152,159,176],"models,":[102],"which":[103],"models":[104,187],"embedding":[107],"into":[108],"mel-spectrogram":[110],"based":[111,128],"on":[112,129],"achieve":[117,162,189],"higher":[118],"audio":[119,208],"quality.":[120],"We":[121,203],"also":[122],"introduce":[123],"prompt":[125,139],"encoder":[126],"variational":[131],"autoencoder":[132],"prosody":[135],"bottleneck":[136],"improve":[138],"representation":[140],"ability.":[141],"Second,":[142],"Tetra-Diff-Speech,":[145],"four":[151],"model-based":[153,177],"modules":[154,178],"design":[156],"diverse":[163],"expressions.":[165],"Finally,":[166],"Tri-Diff-Speech,":[169],"verify":[180],"non-necessity":[182],"best":[191],"results.":[192],"Experimental":[193],"results":[194],"show":[195],"our":[197],"proposed":[198],"outperform":[200],"baseline":[201],"provide":[204],"website":[206],"samples.":[209],"<sup":[210],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[211],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[212]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":5}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
