{"id":"https://openalex.org/W3197943112","doi":"https://doi.org/10.21437/interspeech.2021-1351","title":"Enriching Source Style Transfer in Recognition-Synthesis Based Non-Parallel Voice Conversion","display_name":"Enriching Source Style Transfer in Recognition-Synthesis Based Non-Parallel Voice Conversion","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3197943112","doi":"https://doi.org/10.21437/interspeech.2021-1351","mag":"3197943112"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-1351","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1351","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106698700","display_name":"Zhichao Wang","orcid":"https://orcid.org/0000-0001-8075-1784"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhichao Wang","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014876636","display_name":"Xinyong Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyong Zhou","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036283087","display_name":"Fengyu Yang","orcid":"https://orcid.org/0009-0005-5410-4707"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fengyu Yang","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100455256","display_name":"Tao Li","orcid":"https://orcid.org/0000-0001-8966-7239"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Li","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057476853","display_name":"Hongqiang Du","orcid":"https://orcid.org/0000-0002-4168-9655"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongqiang Du","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066245750","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-9051-2111"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020922543","display_name":"Wendong Gan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210115040","display_name":"iQIYI (China)","ror":"https://ror.org/026xvz038","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210115040"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wendong Gan","raw_affiliation_strings":["iQIYI Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iQIYI Inc, China","institution_ids":["https://openalex.org/I4210115040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100737752","display_name":"Haitao Chen","orcid":"https://orcid.org/0000-0003-2262-4781"},"institutions":[{"id":"https://openalex.org/I4210115040","display_name":"iQIYI (China)","ror":"https://ror.org/026xvz038","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210115040"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haitao Chen","raw_affiliation_strings":["iQIYI Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iQIYI Inc, China","institution_ids":["https://openalex.org/I4210115040"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100429403","display_name":"Hai Li","orcid":"https://orcid.org/0000-0003-3228-6544"},"institutions":[{"id":"https://openalex.org/I4210115040","display_name":"iQIYI (China)","ror":"https://ror.org/026xvz038","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210115040"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hai Li","raw_affiliation_strings":["iQIYI Inc, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iQIYI Inc, China","institution_ids":["https://openalex.org/I4210115040"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.099,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.89425522,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"831","last_page":"835"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9925000071525574,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9337999820709229,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7726526260375977},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6729326844215393},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.4861104488372803},{"id":"https://openalex.org/keywords/transfer","display_name":"Transfer (computing)","score":0.46802622079849243},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3342171311378479},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3306828737258911},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.19417253136634827}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7726526260375977},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6729326844215393},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.4861104488372803},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.46802622079849243},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3342171311378479},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3306828737258911},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.19417253136634827},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2021-1351","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1351","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.46000000834465027}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1731081199","https://openalex.org/W1977362459","https://openalex.org/W2017425464","https://openalex.org/W2120605154","https://openalex.org/W2121387787","https://openalex.org/W2156142001","https://openalex.org/W2156477760","https://openalex.org/W2187089797","https://openalex.org/W2404839462","https://openalex.org/W2471520273","https://openalex.org/W2517513811","https://openalex.org/W2518172956","https://openalex.org/W2532494225","https://openalex.org/W2587088898","https://openalex.org/W2794490148","https://openalex.org/W2801581493","https://openalex.org/W2904459034","https://openalex.org/W2945478979","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2972313557","https://openalex.org/W2972659941","https://openalex.org/W3094635600","https://openalex.org/W3095401840","https://openalex.org/W3097892637","https://openalex.org/W3134921434","https://openalex.org/W3163568691","https://openalex.org/W4288079858","https://openalex.org/W4295731579","https://openalex.org/W4298580827"],"related_works":["https://openalex.org/W2356229341","https://openalex.org/W2349768204","https://openalex.org/W4313326281","https://openalex.org/W574867512","https://openalex.org/W2387271333","https://openalex.org/W381115453","https://openalex.org/W581389233","https://openalex.org/W2361120309","https://openalex.org/W2387981414","https://openalex.org/W3192589309"],"abstract_inverted_index":{"Current":[0],"voice":[1],"conversion":[2],"(VC)":[3],"methods":[4,83],"can":[5,49],"successfully":[6],"convert":[7],"timbre":[8],"of":[9,25,67,159,181],"the":[10,30,71,127,156,163,173,185],"audio.As":[11],"modeling":[12],"source":[13,27,36,132,160],"audio's":[14],"prosody":[15,48,61,72,87],"effectively":[16,78],"is":[17,120,170],"a":[18,35,59,74,85,140,176],"challenging":[19],"task,":[20,47],"there":[21],"are":[22,91,102,191],"still":[23],"limitations":[24],"transferring":[26,136],"style":[28,37,182],"to":[29,93,104,122,145,172],"converted":[31],"speech.This":[32],"study":[33],"proposes":[34],"transfer":[38],"method":[39],"based":[40,143],"on":[41],"recognitionsynthesis":[42],"framework.Previously":[43],"in":[44,73,84,179],"speech":[45,161,186],"generation":[46],"be":[50],"modeled":[51],"explicitly":[52],"with":[53,58],"prosodic":[54,89,157],"features":[55,90],"or":[56],"implicitly":[57,105,154],"latent":[60],"extractor.In":[62],"this":[63],"paper,":[64],"taking":[65],"advantages":[66],"both,":[68],"we":[69,138],"model":[70,95,106],"hybrid":[75],"manner,":[76],"which":[77,108,152],"combines":[79],"explicit":[80,94],"and":[81,99,112,175,188],"implicit":[82],"proposed":[86],"module.Specifically,":[88],"used":[92,103],"prosody,":[96,107],"while":[97,135],"VAE":[98,128],"reference":[100],"encoder":[101,144],"take":[109],"Mel":[110],"spectrum":[111],"bottleneck":[113,150],"feature":[114],"as":[115],"input":[116],"respectively.Furthermore,":[117],"adversarial":[118],"training":[119],"introduced":[121],"remove":[123],"speakerrelated":[124],"information":[125,134],"from":[126,149,162],"outputs,":[129],"avoiding":[130],"leaking":[131],"speaker":[133,189],"style.Finally,":[137],"use":[139],"modified":[141],"self-attention":[142],"extract":[146],"sentential":[147],"context":[148],"features,":[151],"also":[153],"aggregates":[155],"aspects":[158],"layered":[164],"representations.Experiments":[165],"show":[166],"that":[167],"our":[168],"approach":[169],"superior":[171],"baseline":[174],"competitive":[177],"system":[178],"terms":[180],"transfer;":[183],"meanwhile,":[184],"quality":[187],"similarity":[190],"well":[192],"maintained.":[193]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":7},{"year":2022,"cited_by_count":4}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
