{"id":"https://openalex.org/W4402978831","doi":"https://doi.org/10.1109/taslp.2024.3453598","title":"Cross-Utterance Conditioned VAE for Speech Generation","display_name":"Cross-Utterance Conditioned VAE for Speech Generation","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4402978831","doi":"https://doi.org/10.1109/taslp.2024.3453598"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3453598","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3453598","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://research.manchester.ac.uk/en/publications/89865bbf-197b-45be-a262-d45346ccf49d","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101457256","display_name":"Yang Li","orcid":"https://orcid.org/0009-0002-7073-9698"},"institutions":[{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Yang Li","raw_affiliation_strings":["Department of Computer Science, The University of Manchester, Manchester, U.K"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, The University of Manchester, Manchester, U.K","institution_ids":["https://openalex.org/I28407311"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111346880","display_name":"Cheng Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cheng Yu","raw_affiliation_strings":["School of Creativity and Art, ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Creativity and Art, ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077772423","display_name":"Guangzhi Sun","orcid":"https://orcid.org/0000-0002-5886-056X"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Guangzhi Sun","raw_affiliation_strings":["Machine Intelligence Lab, University of Cambridge, Cambridge, U.K"],"affiliations":[{"raw_affiliation_string":"Machine Intelligence Lab, University of Cambridge, Cambridge, U.K","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059080410","display_name":"Weiqin Zu","orcid":"https://orcid.org/0009-0002-9745-4457"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiqin Zu","raw_affiliation_strings":["School of Creativity and Art, ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Creativity and Art, ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101703045","display_name":"Zheng Tian","orcid":"https://orcid.org/0009-0008-0622-8512"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Tian","raw_affiliation_strings":["School of Creativity and Art, ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Creativity and Art, ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104546839","display_name":"Ying Wen","orcid":"https://orcid.org/0000-0003-1247-2382"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Wen","raw_affiliation_strings":["School of Electronic, Information and Electrical Engineering (SEIEE), Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic, Information and Electrical Engineering (SEIEE), Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100721259","display_name":"Wei Pan","orcid":"https://orcid.org/0000-0003-1121-9879"},"institutions":[{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wei Pan","raw_affiliation_strings":["Department of Computer Science, The University of Manchester, Manchester, U.K"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, The University of Manchester, Manchester, U.K","institution_ids":["https://openalex.org/I28407311"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100460206","display_name":"Chao Zhang","orcid":"https://orcid.org/0000-0002-7730-5131"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Zhang","raw_affiliation_strings":["Department of Electronic Engineering, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100384727","display_name":"Jun Wang","orcid":"https://orcid.org/0000-0002-4021-4228"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jun Wang","raw_affiliation_strings":["Department of Computer Science, University College London, London, U.K"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University College London, London, U.K","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100397725","display_name":"Yang Yang","orcid":"https://orcid.org/0000-0003-0608-9408"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Yang","raw_affiliation_strings":["Thrust of Internet of Things, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"Thrust of Internet of Things, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063874810","display_name":"Fanglei Sun","orcid":"https://orcid.org/0000-0002-4302-2512"},"institutions":[{"id":"https://openalex.org/I148128674","display_name":"University of Shanghai for Science and Technology","ror":"https://ror.org/00ay9v204","country_code":"CN","type":"education","lineage":["https://openalex.org/I148128674"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fanglei Sun","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Shanghai for Science and Technology, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Shanghai for Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I148128674"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5101457256"],"corresponding_institution_ids":["https://openalex.org/I28407311"],"apc_list":null,"apc_paid":null,"fwci":1.7178,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.87032912,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":100},"biblio":{"volume":"32","issue":null,"first_page":"4263","last_page":"4276"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9775999784469604,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9735999703407288,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.8010676503181458},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4632298946380615},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43891388177871704},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4080680310726166},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3390195965766907},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.329639732837677},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32214978337287903},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.09744647145271301}],"concepts":[{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.8010676503181458},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4632298946380615},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43891388177871704},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4080680310726166},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3390195965766907},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.329639732837677},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32214978337287903},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.09744647145271301}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/taslp.2024.3453598","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3453598","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:pure.atira.dk:openaire/89865bbf-197b-45be-a262-d45346ccf49d","is_oa":true,"landing_page_url":"https://research.manchester.ac.uk/en/publications/89865bbf-197b-45be-a262-d45346ccf49d","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Li, Y, Yu, C, Sun, G, Zu, W, Tian, Z, Wen, Y, Pan, W, Zhang, C, Wang, J, Yang, Y & Sun, F 2024, 'Cross-Utterance Conditioned VAE for Speech Generation', IEEE/ACM Transactions on Audio Speech and Language Processing, vol. 32, pp. 4263-4276. https://doi.org/10.1109/TASLP.2024.3453598","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-145549","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-145549","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"pmh:oai:pure.atira.dk:openaire/89865bbf-197b-45be-a262-d45346ccf49d","is_oa":true,"landing_page_url":"https://research.manchester.ac.uk/en/publications/89865bbf-197b-45be-a262-d45346ccf49d","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Li, Y, Yu, C, Sun, G, Zu, W, Tian, Z, Wen, Y, Pan, W, Zhang, C, Wang, J, Yang, Y & Sun, F 2024, 'Cross-Utterance Conditioned VAE for Speech Generation', IEEE/ACM Transactions on Audio Speech and Language Processing, vol. 32, pp. 4263-4276. https://doi.org/10.1109/TASLP.2024.3453598","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.5099999904632568,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W2107740512","https://openalex.org/W2107860279","https://openalex.org/W2250539671","https://openalex.org/W2737697117","https://openalex.org/W2896457183","https://openalex.org/W2907262790","https://openalex.org/W2953719307","https://openalex.org/W2962691331","https://openalex.org/W2962850167","https://openalex.org/W2964138190","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W2973217961","https://openalex.org/W3015282541","https://openalex.org/W3015440759","https://openalex.org/W3016021263","https://openalex.org/W3016136182","https://openalex.org/W3081488690","https://openalex.org/W3150572638","https://openalex.org/W3163338468","https://openalex.org/W3163339651","https://openalex.org/W3197294703","https://openalex.org/W3197324626","https://openalex.org/W3197632081","https://openalex.org/W3197957589","https://openalex.org/W3198152857","https://openalex.org/W3198213150","https://openalex.org/W4221142789","https://openalex.org/W4221159457","https://openalex.org/W4225596771","https://openalex.org/W4280604450","https://openalex.org/W4385822410"],"related_works":["https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W2038083449","https://openalex.org/W3177678247","https://openalex.org/W1999617572","https://openalex.org/W2944572343","https://openalex.org/W2333799855","https://openalex.org/W2351687372","https://openalex.org/W2004087835","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Speech":[0],"synthesis":[1,33,107,193],"systems":[2],"powered":[3],"by":[4],"neural":[5],"networks":[6],"hold":[7],"promise":[8],"for":[9,104,111,116],"multimedia":[10],"production,":[11],"but":[12],"frequently":[13],"face":[14],"issues":[15],"with":[16,133],"producing":[17,157,196],"expressive":[18,200],"speech":[19,32,43,106,117,168,192],"and":[20,40,56,80,113,164,177,194,199],"seamless":[21],"editing.":[22,118],"In":[23],"response,":[24],"we":[25],"present":[26],"the":[27,48,57,68,73,127,141,144,182],"Cross-Utterance":[28],"Conditioned":[29],"Variational":[30],"Autoencoder":[31],"(CUC-VAE":[34],"S2)":[35],"framework":[36,46,71],"to":[37,86,130],"enhance":[38,191],"prosody":[39,95,135],"ensure":[41],"natural":[42,198],"generation.":[44,96],"This":[45],"leverages":[47,148],"powerful":[49],"representational":[50],"capabilities":[51],"of":[52,60,67,126],"pre-trained":[53],"language":[54],"models":[55,189],"re-expression":[58],"abilities":[59],"variational":[61],"autoencoders":[62],"(VAEs).":[63],"The":[64,119],"core":[65],"component":[66],"CUC-VAE":[69,109,114,120,145],"S2":[70],"is":[72,122],"cross-utterance":[74],"CVAE,":[75],"which":[76],"extracts":[77],"acoustic,":[78],"speaker,":[79],"textual":[81],"features":[82],"from":[83,137],"surrounding":[84,138],"sentences":[85],"generate":[87,131],"context-sensitive":[88],"prosodic":[89],"features,":[90],"more":[91,197],"accurately":[92],"emulating":[93],"human":[94],"We":[97],"further":[98],"propose":[99],"two":[100],"practical":[101],"algorithms":[102],"tailored":[103],"distinct":[105],"applications:":[108],"TTS":[110,121],"text-to-speech":[112],"SE":[115,146],"a":[123],"direct":[124],"application":[125],"framework,":[128],"designed":[129],"audio":[132,158],"contextual":[134,155],"derived":[136],"texts.":[139],"On":[140],"other":[142],"hand,":[143],"algorithm":[147],"real":[149,162],"mel":[150],"spectrogram":[151],"sampling":[152],"conditioned":[153],"on":[154,171,181],"information,":[156],"that":[159,186],"closely":[160],"mirrors":[161],"sound":[163],"thereby":[165],"facilitating":[166],"flexible":[167],"editing":[169],"based":[170],"text":[172],"such":[173],"as":[174],"deletion,":[175],"insertion,":[176],"replacement.":[178],"Experimental":[179],"results":[180],"LibriTTS":[183],"datasets":[184],"demonstrate":[185],"our":[187],"proposed":[188],"significantly":[190],"editing,":[195],"speech.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-06T07:47:59.780226","created_date":"2025-10-10T00:00:00"}
