{"id":"https://openalex.org/W3203614938","doi":"https://doi.org/10.1109/icassp43922.2022.9747319","title":"Cloning One\u2019s Voice Using Very Limited Data in the Wild","display_name":"Cloning One\u2019s Voice Using Very Limited Data in the Wild","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W3203614938","doi":"https://doi.org/10.1109/icassp43922.2022.9747319","mag":"3203614938"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747319","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747319","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052785114","display_name":"Dongyang Dai","orcid":"https://orcid.org/0000-0001-5811-5781"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dongyang Dai","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055175414","display_name":"Yuanzhe Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuanzhe Chen","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100379262","display_name":"Li Chen","orcid":"https://orcid.org/0000-0002-5842-838X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li Chen","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004787804","display_name":"Ming Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming Tu","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100396545","display_name":"Lu Liu","orcid":"https://orcid.org/0000-0003-2603-4973"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu Liu","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035534911","display_name":"Rui Xia","orcid":"https://orcid.org/0000-0002-0621-1058"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rui Xia","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018710096","display_name":"Qiao Tian","orcid":"https://orcid.org/0000-0001-8177-7724"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao Tian","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100339106","display_name":"Yu\u2010Ping Wang","orcid":"https://orcid.org/0000-0001-9340-5864"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuping Wang","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103854502","display_name":"Yuxuan Wang","orcid":"https://orcid.org/0000-0001-8269-3354"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuxuan Wang","raw_affiliation_strings":["ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc.,Speech, Audio &#x0026; Music Intelligence (SAMI)","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5052785114"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.4552,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.83430295,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"8322","last_page":"8326"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9922999739646912,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8533682823181152},{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.8086332082748413},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.7640514373779297},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7605612277984619},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7036501169204712},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.5341716408729553},{"id":"https://openalex.org/keywords/cloning","display_name":"Cloning (programming)","score":0.5321397185325623},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4827265441417694},{"id":"https://openalex.org/keywords/popularity","display_name":"Popularity","score":0.48015135526657104},{"id":"https://openalex.org/keywords/clone","display_name":"clone (Java method)","score":0.4692426025867462},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3729866147041321},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.12342241406440735},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10010558366775513},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.07243770360946655}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8533682823181152},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.8086332082748413},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.7640514373779297},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7605612277984619},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7036501169204712},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.5341716408729553},{"id":"https://openalex.org/C121050878","wikidata":"https://www.wikidata.org/wiki/Q5135020","display_name":"Cloning (programming)","level":2,"score":0.5321397185325623},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4827265441417694},{"id":"https://openalex.org/C2780586970","wikidata":"https://www.wikidata.org/wiki/Q1357284","display_name":"Popularity","level":2,"score":0.48015135526657104},{"id":"https://openalex.org/C81089528","wikidata":"https://www.wikidata.org/wiki/Q5134986","display_name":"clone (Java method)","level":3,"score":0.4692426025867462},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3729866147041321},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.12342241406440735},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10010558366775513},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.07243770360946655},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C552990157","wikidata":"https://www.wikidata.org/wiki/Q7430","display_name":"DNA","level":2,"score":0.0},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747319","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747319","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2518172956","https://openalex.org/W2519091744","https://openalex.org/W2608207374","https://openalex.org/W2788357188","https://openalex.org/W2808706139","https://openalex.org/W2946200149","https://openalex.org/W2949382160","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2970006822","https://openalex.org/W2980709326","https://openalex.org/W3016139610","https://openalex.org/W3025165719","https://openalex.org/W3036601975","https://openalex.org/W3095035471","https://openalex.org/W3097777922","https://openalex.org/W6726528559","https://openalex.org/W6748588790","https://openalex.org/W6752888775","https://openalex.org/W6763832098","https://openalex.org/W6767111847","https://openalex.org/W6780218876"],"related_works":["https://openalex.org/W2579204149","https://openalex.org/W2395605663","https://openalex.org/W2005104111","https://openalex.org/W4298392819","https://openalex.org/W3203614938","https://openalex.org/W4319439706","https://openalex.org/W3158085692","https://openalex.org/W1968786616","https://openalex.org/W1985829801","https://openalex.org/W4319586549"],"abstract_inverted_index":{"With":[0],"the":[1,8,41,47,53,59,70,76,98,105,112,117,120,128],"increasing":[2],"popularity":[3],"of":[4,73,79,116,127],"speech":[5,17,125,130],"synthesis":[6],"products,":[7],"industry":[9],"has":[10,101,131],"put":[11],"forward":[12],"more":[13,136],"requirements":[14],"for":[15,91],"personalized":[16],"synthesis:":[18],"(1)":[19],"How":[20,33],"to":[21,27,34,110],"use":[22],"low-resource,":[23],"easily":[24],"accessible":[25],"data":[26,96],"clone":[28,35],"a":[29,36],"person\u2019s":[30,37],"voice.":[31],"(2)":[32],"voice":[38],"while":[39,84],"controlling":[40,111],"style":[42,113],"and":[43,61,75,114],"prosody.":[44],"To":[45],"solve":[46],"above":[48],"two":[49,67],"problems,":[50],"we":[51],"proposed":[52],"Hieratron":[54,100],"model":[55],"framework":[56],"in":[57,97,108],"which":[58],"prosody":[60],"timbre":[62,74],"are":[63],"modeled":[64],"separately":[65],"using":[66],"modules,":[68],"therefore,":[69],"independent":[71],"control":[72],"other":[77],"characteristics":[78],"audio":[80],"can":[81],"be":[82],"achieved":[83],"generating":[85],"speech.":[86],"The":[87],"practice":[88],"shows":[89],"that,":[90],"very":[92],"limited":[93],"target":[94],"speaker":[95],"wild,":[99],"obvious":[102],"advantages":[103],"over":[104],"traditional":[106],"method,":[107],"addition":[109],"language":[115],"generated":[118,129],"speech,":[119],"mean":[121],"opinion":[122],"score":[123],"on":[124],"quality":[126],"also":[132],"been":[133],"improved":[134],"by":[135],"than":[137],"0.2":[138],"points.":[139]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
