{"id":"https://openalex.org/W4403780681","doi":"https://doi.org/10.1145/3664647.3681695","title":"VoiceTuner: Self-Supervised Pre-training and Efficient Fine-tuning For Voice Generation","display_name":"VoiceTuner: Self-Supervised Pre-training and Efficient Fine-tuning For Voice Generation","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403780681","doi":"https://doi.org/10.1145/3664647.3681695"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681695","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681695","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011787904","display_name":"Rongjie Huang","orcid":"https://orcid.org/0000-0002-1695-9000"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rongjie Huang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115603821","display_name":"Yongqi Wang","orcid":"https://orcid.org/0000-0003-4695-3440"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yongqi Wang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100586774","display_name":"Ruofan Hu","orcid":"https://orcid.org/0009-0005-1723-6778"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruofan Hu","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019256380","display_name":"Xiaoshan Xu","orcid":"https://orcid.org/0000-0002-4363-392X"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xiaoshan Xu","raw_affiliation_strings":["The University of Hong Kong, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"The University of Hong Kong, Hangzhou, China","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077830473","display_name":"Zhiqing Hong","orcid":"https://orcid.org/0000-0003-3682-4290"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiqing Hong","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028178332","display_name":"Dongchao Yang","orcid":"https://orcid.org/0000-0003-0879-4047"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Dongchao Yang","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, Hong Kong"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009897266","display_name":"Xize Cheng","orcid":"https://orcid.org/0000-0001-9708-3225"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xize Cheng","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111185441","display_name":"Zehan Wang","orcid":"https://orcid.org/0009-0007-6426-3749"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zehan Wang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073215052","display_name":"Ziyue Jiang","orcid":"https://orcid.org/0009-0005-1358-8098"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziyue Jiang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016502904","display_name":"Zhenhui Ye","orcid":"https://orcid.org/0000-0002-7105-014X"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenhui Ye","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113150465","display_name":"Luping Liu","orcid":"https://orcid.org/0000-0003-2424-3560"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Luping Liu","raw_affiliation_strings":["The University of Hong Kong, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"The University of Hong Kong, Hangzhou, China","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014586639","display_name":"Siqi Zheng","orcid":"https://orcid.org/0000-0002-4467-8505"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Siqi Zheng","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079260216","display_name":"Zhou Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhou Zhao","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5011787904"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16661999,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10630","last_page":"10639"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7151068449020386},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.7130247354507446},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5236061215400696},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3567584753036499}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7151068449020386},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.7130247354507446},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5236061215400696},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3567584753036499},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3681695","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681695","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2131738223","https://openalex.org/W2981852735","https://openalex.org/W2995181338","https://openalex.org/W3129009457","https://openalex.org/W3158762648","https://openalex.org/W3203407300","https://openalex.org/W3206191467","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3215615641","https://openalex.org/W4285345683","https://openalex.org/W4285483774","https://openalex.org/W4288089799","https://openalex.org/W4375869257","https://openalex.org/W4394671563","https://openalex.org/W6840200333"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659"],"abstract_inverted_index":{"Voice":[0],"large":[1],"language":[2,10],"models":[3],"(LLMs)":[4],"cast":[5],"voice":[6,32,62,152],"synthesis":[7,153],"as":[8,112],"a":[9,14,53,101,113],"modeling":[11],"task":[12],"in":[13,34,85,96,130],"discrete":[15],"space,":[16],"and":[17,42,56,75,125,150,159,166],"have":[18],"demonstrated":[19],"significant":[20],"progress":[21],"to":[22,66,89,105],"date.":[23],"Despite":[24],"the":[25,28,92],"recent":[26],"success,":[27],"current":[29],"development":[30],"of":[31],"LLMs":[33],"low-resource":[35,61],"applications":[36,144],"is":[37],"hampered":[38],"by":[39],"data":[40,68,164],"scarcity":[41],"high":[43,93],"computational":[44,167],"cost.":[45,168],"In":[46],"this":[47],"work,":[48],"we":[49,70,99],"propose":[50],"VoiceTuner,":[51],"with":[52,135,162],"self-supervised":[54],"pre-training":[55],"efficient":[57],"fine-tuning":[58],"approach":[59],"for":[60],"generation.":[63],"Specifically,":[64],"1)":[65],"mitigate":[67],"scarcity,":[69],"leverage":[71],"large-scale":[72],"unlabeled":[73],"dataset":[74],"pre-train":[76],"VoiceTuner-SSL":[77,120],"without":[78],"pre-defined":[79],"applications,":[80],"which":[81],"can":[82],"be":[83],"fine-tuned":[84],"downstream":[86,143],"tasks;":[87],"2)":[88],"further":[90],"reduce":[91],"training":[94],"cost":[95],"complete":[97],"fine-tuning,":[98],"introduce":[100],"multiscale":[102],"transformer":[103],"adapter":[104],"effectively":[106],"update":[107],"only":[108],"around":[109],"1%":[110],"parameters":[111],"plug-and-play":[114],"module.":[115],"Experimental":[116],"results":[117,129],"demonstrate":[118],"that":[119],"presents":[121],"strong":[122],"acoustic":[123],"continuations,":[124],"VoiceTuner":[126],"achieves":[127],"state-of-the-art":[128],"rich-resource":[131],"TTS":[132],"evaluation":[133],"compared":[134],"competitive":[136],"baseline":[137],"models.":[138],"Low-resource":[139],"(1h,":[140],"10h,":[141],"30h)":[142],"including":[145],"zero-shot":[146],"TTS,":[147,149],"instruction":[148],"singing":[151],"present":[154],"VoiceTuner's":[155],"superior":[156],"audio":[157],"quality":[158],"style":[160],"similarity":[161],"reduced":[163],"requirement":[165],"Audio":[169],"samples":[170],"are":[171],"available":[172],"at":[173],"https://VoiceTuner.github.io":[174]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
