{"id":"https://openalex.org/W4402979375","doi":"https://doi.org/10.1109/icme57554.2024.10687605","title":"CosDiff: Code-Switching TTS Model Based on A Multi-Task DDIM","display_name":"CosDiff: Code-Switching TTS Model Based on A Multi-Task DDIM","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4402979375","doi":"https://doi.org/10.1109/icme57554.2024.10687605"},"language":"en","primary_location":{"id":"doi:10.1109/icme57554.2024.10687605","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687605","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5115598124","display_name":"Ke Chen","orcid":"https://orcid.org/0009-0005-3069-9769"},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ke Chen","raw_affiliation_strings":["Xinjiang University,School of Computer Science and Technology,Urumqi,China"],"affiliations":[{"raw_affiliation_string":"Xinjiang University,School of Computer Science and Technology,Urumqi,China","institution_ids":["https://openalex.org/I96908189"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056424666","display_name":"Zhihua Huang","orcid":"https://orcid.org/0000-0001-5710-5231"},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhihua Huang","raw_affiliation_strings":["Xinjiang University,Xinjiang Key Laboratory of Signal Detection and Processing,Urumqi,China"],"affiliations":[{"raw_affiliation_string":"Xinjiang University,Xinjiang Key Laboratory of Signal Detection and Processing,Urumqi,China","institution_ids":["https://openalex.org/I96908189"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088334916","display_name":"Kexin Lu","orcid":"https://orcid.org/0009-0004-8154-4882"},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kexin Lu","raw_affiliation_strings":["Xinjiang University,School of Computer Science and Technology,Urumqi,China"],"affiliations":[{"raw_affiliation_string":"Xinjiang University,School of Computer Science and Technology,Urumqi,China","institution_ids":["https://openalex.org/I96908189"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100425112","display_name":"Yonghong Yan","orcid":"https://orcid.org/0000-0001-6907-5770"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yonghong Yan","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding Institute of Acoustics,Chinese Academy of Sciences,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding Institute of Acoustics,Chinese Academy of Sciences,Beijing,China","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5115598124"],"corresponding_institution_ids":["https://openalex.org/I96908189"],"apc_list":null,"apc_paid":null,"fwci":1.4504,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.85055008,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.8766999840736389,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.8766999840736389,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8409000039100647,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8051999807357788,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7984541058540344},{"id":"https://openalex.org/keywords/code-switching","display_name":"Code-switching","score":0.7199704051017761},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5355588793754578},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.44057387113571167},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.39827296137809753},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32058489322662354},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08727654814720154},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.07008224725723267}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7984541058540344},{"id":"https://openalex.org/C18552078","wikidata":"https://www.wikidata.org/wiki/Q255615","display_name":"Code-switching","level":2,"score":0.7199704051017761},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5355588793754578},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.44057387113571167},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.39827296137809753},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32058489322662354},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08727654814720154},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.07008224725723267},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme57554.2024.10687605","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687605","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2519091744","https://openalex.org/W2903739847","https://openalex.org/W2936832667","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2973084242","https://openalex.org/W2996952407","https://openalex.org/W3015212790","https://openalex.org/W3016139610","https://openalex.org/W3095012670","https://openalex.org/W3095361818","https://openalex.org/W3095873922","https://openalex.org/W3161695192","https://openalex.org/W3198213150","https://openalex.org/W3210530853","https://openalex.org/W4226332109","https://openalex.org/W4304099317","https://openalex.org/W4372260486","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6779823529","https://openalex.org/W6783182287","https://openalex.org/W6783713337","https://openalex.org/W6783867762","https://openalex.org/W6795261426","https://openalex.org/W6802142237","https://openalex.org/W6809884996","https://openalex.org/W6852811595","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W2771594921","https://openalex.org/W2432897346","https://openalex.org/W2181336723","https://openalex.org/W4389976243","https://openalex.org/W3158134258","https://openalex.org/W3138119129","https://openalex.org/W2940588741","https://openalex.org/W2293063924","https://openalex.org/W2974240475","https://openalex.org/W4388441038"],"abstract_inverted_index":{"Although":[0],"existing":[1],"Text-To-Speech":[2],"(TTS)":[3],"synthesizers":[4],"are":[5],"able":[6],"to":[7,98],"generate":[8,42],"high-quality":[9],"speech":[10,43],"in":[11,146],"most":[12],"cases,":[13],"their":[14],"overall":[15],"performance":[16,96],"is":[17],"still":[18],"affected":[19],"by":[20],"the":[21,24,80,99,102,121,133,141,148,157],"distribution":[22],"of":[23,101,118,143,150],"training":[25],"data.":[26],"When":[27],"processing":[28],"tasks":[29],"that":[30,44],"involve":[31],"complex":[32],"data":[33,123],"distributions,":[34],"such":[35],"as":[36],"code-switching":[37,94],"TTS,":[38],"these":[39],"models":[40],"might":[41],"sounds":[45],"unnatural":[46],"or":[47],"has":[48],"low":[49],"speaker":[50,103],"similarity.":[51],"In":[52,113],"this":[53,144],"paper,":[54],"we":[55,83,115],"propose":[56],"CosDiff,":[57],"a":[58,64,85,92],"Code-Switching":[59],"TTS":[60,77],"model":[61],"based":[62],"on":[63],"multi-task":[65],"Denoising":[66],"Diffusion":[67],"Implicit":[68],"Model":[69],"(DDIM),":[70],"which":[71,106],"integrates":[72],"Voice":[73],"Conversion":[74],"(VC)":[75],"and":[76,127,155],"functionalities.":[78],"Utilizing":[79],"VC":[81],"function,":[82],"construct":[84],"single-speaker":[86,110],"bilingual":[87],"dataset":[88],"for":[89],"training,":[90],"achieving":[91],"superior":[93],"synthesis":[95],"compared":[97],"outcomes":[100],"encoder":[104],"method,":[105],"trains":[107],"with":[108],"multiple":[109],"monolingual":[111],"datasets.":[112],"addition,":[114],"employ":[116],"strategies":[117],"directly":[119],"predicting":[120],"clean":[122],"x<inf":[124],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[125],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">0</inf>":[126],"progressive":[128],"diffusion":[129],"distillation,":[130],"further":[131],"accelerating":[132],"model\u2019s":[134],"sampling":[135,153],"process.":[136],"The":[137],"experimental":[138],"results":[139],"demonstrate":[140],"efficacy":[142],"method":[145],"improving":[147],"quality":[149],"generation,":[151],"increasing":[152],"speed,":[154],"distilling":[156],"model.":[158]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
