{"id":"https://openalex.org/W3205316472","doi":"https://doi.org/10.1109/icassp43922.2022.9747801","title":"Using Multiple Reference Audios and Style Embedding Constraints for Speech Synthesis","display_name":"Using Multiple Reference Audios and Style Embedding Constraints for Speech Synthesis","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W3205316472","doi":"https://doi.org/10.1109/icassp43922.2022.9747801","mag":"3205316472"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747801","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747801","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5107748760","display_name":"Gong Cheng","orcid":"https://orcid.org/0009-0004-0272-3541"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Cheng Gong","raw_affiliation_strings":["Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050763764","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-4005-5036"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059767940","display_name":"Zhen-Hua Ling","orcid":"https://orcid.org/0000-0001-7853-5273"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210103894","display_name":"Hefei National Center for Physical Sciences at Nanoscale","ror":"https://ror.org/01jeedh73","country_code":"CN","type":"facility","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366","https://openalex.org/I4210103894"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenhua Ling","raw_affiliation_strings":["National Engineering Laboratory for Speech and Language Information Processing, University of Science and Technology of China,Hefei,China","National Engineering Laboratory for Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Laboratory for Speech and Language Information Processing, University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041","https://openalex.org/I4210103894"]},{"raw_affiliation_string":"National Engineering Laboratory for Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100620206","display_name":"Ju Zhang","orcid":"https://orcid.org/0000-0002-0378-6497"},"institutions":[{"id":"https://openalex.org/I4210094894","display_name":"China Automotive Technology and Research Center","ror":"https://ror.org/00r5r6807","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210094894"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ju Zhang","raw_affiliation_strings":["Huiyan Technology (Tianjin) Co., Ltd.,China","Huiyan Technology (Tianjin) Co., Ltd., China"],"affiliations":[{"raw_affiliation_string":"Huiyan Technology (Tianjin) Co., Ltd.,China","institution_ids":["https://openalex.org/I4210094894"]},{"raw_affiliation_string":"Huiyan Technology (Tianjin) Co., Ltd., China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]},{"id":"https://openalex.org/I177738480","display_name":"Japan Advanced Institute of Science and Technology","ror":"https://ror.org/03frj4r98","country_code":"JP","type":"education","lineage":["https://openalex.org/I177738480"]}],"countries":["CN","JP"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University,Tianjin,China","Japan Advanced Institute of Science and Technology, Ishikawa, Japan","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Japan Advanced Institute of Science and Technology, Ishikawa, Japan","institution_ids":["https://openalex.org/I177738480"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5107748760"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":0.5231,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.61069592,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"7912","last_page":"7916"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7636536955833435},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.6975091099739075},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.668720543384552},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.6597404479980469},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5699066519737244},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.49267059564590454},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.48551130294799805},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.47828754782676697},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4745003879070282},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.46243536472320557},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.41406333446502686},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3962956368923187}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7636536955833435},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.6975091099739075},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.668720543384552},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.6597404479980469},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5699066519737244},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.49267059564590454},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.48551130294799805},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.47828754782676697},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4745003879070282},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.46243536472320557},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41406333446502686},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3962956368923187},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747801","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747801","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.49000000953674316}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W2794490148","https://openalex.org/W2795109282","https://openalex.org/W2803832867","https://openalex.org/W2885800352","https://openalex.org/W2896457183","https://openalex.org/W2903739847","https://openalex.org/W2904459034","https://openalex.org/W2906797124","https://openalex.org/W2914049472","https://openalex.org/W2962691331","https://openalex.org/W2963272440","https://openalex.org/W2963341956","https://openalex.org/W2963609956","https://openalex.org/W2963927338","https://openalex.org/W2964243274","https://openalex.org/W2964307104","https://openalex.org/W2971905065","https://openalex.org/W2990883660","https://openalex.org/W3015212100","https://openalex.org/W3161436426","https://openalex.org/W3161732385","https://openalex.org/W3174758275","https://openalex.org/W3196467321","https://openalex.org/W4295731579","https://openalex.org/W4298580827","https://openalex.org/W6736996214","https://openalex.org/W6748409065","https://openalex.org/W6750489868","https://openalex.org/W6752051073","https://openalex.org/W6755207826","https://openalex.org/W6757422803","https://openalex.org/W6768158615","https://openalex.org/W6796730497"],"related_works":["https://openalex.org/W4392904630","https://openalex.org/W169399214","https://openalex.org/W258725851","https://openalex.org/W4391272374","https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W40885451","https://openalex.org/W1927421023","https://openalex.org/W10581632","https://openalex.org/W2108985546"],"abstract_inverted_index":{"The":[0,145],"end-to-end":[1],"speech":[2,15,50,61,70,156],"synthesis":[3],"model":[4,67,152,171],"can":[5,153,166],"directly":[6],"take":[7],"an":[8,30],"utterance":[9],"as":[10,130],"reference":[11,27,88,102,163],"audio,":[12],"and":[13,21,49,60,90,141,158,165],"generate":[14],"from":[16,116,126],"the":[17,26,42,46,54,66,98,108,135,139,150,155,169],"text":[18,48,59],"with":[19,71,161],"prosody":[20],"speaker":[22],"characteristics":[23],"similar":[24],"to":[25,41,68,80],"audio.":[28,100],"However,":[29],"appropriate":[31],"acoustic":[32],"embedding":[33,92,125],"must":[34],"be":[35],"manually":[36],"selected":[37,106],"during":[38],"inference.":[39],"Due":[40],"fact":[43],"that":[44,149],"only":[45,97],"matched":[47],"are":[51,104],"used":[52],"in":[53,172],"training":[55],"process,":[56],"using":[57,86,96,107],"unmatched":[58],"for":[62],"inference":[63],"would":[64],"cause":[65],"synthesize":[69],"low":[72],"content":[73,159],"quality.":[74],"In":[75,119],"this":[76],"study,":[77],"we":[78,121],"propose":[79],"mitigate":[81],"these":[82],"two":[83],"problems":[84],"by":[85,112,133],"multiple":[87,162],"audios":[89,103,164],"style":[91,124,143,177],"constraints":[93],"rather":[94],"than":[95],"target":[99],"Multiple":[101],"automatically":[105],"sentence":[109],"similarity":[110],"determined":[111],"Bidirectional":[113],"Encoder":[114],"Representations":[115],"Transformers":[117],"(BERT).":[118],"addition,":[120],"use":[122],"\"target\"":[123,142],"a":[127,131],"pre-trained":[128],"encoder":[129],"constraint":[132],"considering":[134],"mutual":[136],"information":[137],"between":[138],"predicted":[140],"embedding.":[144],"experimental":[146],"results":[147],"show":[148],"proposed":[151],"improve":[154],"naturalness":[157],"quality":[160],"also":[167],"outperform":[168],"baseline":[170],"ABX":[173],"preference":[174],"tests":[175],"of":[176],"similarity.":[178]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2022,"cited_by_count":1}],"updated_date":"2026-03-25T13:04:00.132906","created_date":"2025-10-10T00:00:00"}
