{"id":"https://openalex.org/W4376852275","doi":"https://doi.org/10.1145/3573942.3574120","title":"Voicifier-LN: An Novel Approach to Elevate the Speaker Similarity for General Zero-shot Multi-Speaker TTS","display_name":"Voicifier-LN: An Novel Approach to Elevate the Speaker Similarity for General Zero-shot Multi-Speaker TTS","publication_year":2022,"publication_date":"2022-09-23","ids":{"openalex":"https://openalex.org/W4376852275","doi":"https://doi.org/10.1145/3573942.3574120"},"language":"en","primary_location":{"id":"doi:10.1145/3573942.3574120","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1145/3573942.3574120","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 5th International Conference on Artificial Intelligence and Pattern Recognition","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102845007","display_name":"Dengfeng Ke","orcid":"https://orcid.org/0000-0001-8459-0412"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dengfeng Ke","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068030070","display_name":"Liangjie Huang","orcid":"https://orcid.org/0000-0003-0770-1582"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liangjie Huang","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001927761","display_name":"Wenhan Yao","orcid":"https://orcid.org/0000-0003-1014-9565"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenhan Yao","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076551862","display_name":"Ruixin Hu","orcid":"https://orcid.org/0000-0002-0999-0712"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruixin Hu","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110976426","display_name":"Xueyin Zu","orcid":null},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xueyin Zu","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006684220","display_name":"Yanlu Xie","orcid":"https://orcid.org/0000-0001-6765-4808"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanlu Xie","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052176517","display_name":"Jinsong Zhang","orcid":"https://orcid.org/0000-0002-1603-3136"},"institutions":[{"id":"https://openalex.org/I115212828","display_name":"Beijing Language and Culture University","ror":"https://ror.org/03te2zs36","country_code":"CN","type":"education","lineage":["https://openalex.org/I115212828"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinsong Zhang","raw_affiliation_strings":["Beijing Language and Culture University, China"],"affiliations":[{"raw_affiliation_string":"Beijing Language and Culture University, China","institution_ids":["https://openalex.org/I115212828"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5102845007"],"corresponding_institution_ids":["https://openalex.org/I115212828"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17604375,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"15","issue":null,"first_page":"950","last_page":"955"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7234710454940796},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7015202045440674},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6949297785758972},{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.6924551725387573},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4977912902832031},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4841757118701935},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.4760337173938751},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.42569127678871155},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4101673662662506},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3835276663303375},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.06766322255134583}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7234710454940796},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7015202045440674},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6949297785758972},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.6924551725387573},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4977912902832031},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4841757118701935},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.4760337173938751},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.42569127678871155},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4101673662662506},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3835276663303375},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.06766322255134583},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3573942.3574120","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1145/3573942.3574120","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 5th International Conference on Artificial Intelligence and Pattern Recognition","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5199999809265137,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W1875842236","https://openalex.org/W2794490148","https://openalex.org/W2970006822","https://openalex.org/W2973034126","https://openalex.org/W2982277844","https://openalex.org/W3020570669","https://openalex.org/W3021117141","https://openalex.org/W3048869804","https://openalex.org/W3168527213","https://openalex.org/W4221156079","https://openalex.org/W4221159457","https://openalex.org/W6736996214","https://openalex.org/W6754224190"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2011227383","https://openalex.org/W2088854863","https://openalex.org/W4402568167","https://openalex.org/W3179495260","https://openalex.org/W1976719989","https://openalex.org/W3127543252","https://openalex.org/W2065606036","https://openalex.org/W2998781440"],"abstract_inverted_index":{"Speeches":[0],"generated":[1],"from":[2,31,68],"neural":[3],"network-based":[4],"Text-to-Speech":[5],"(TTS)":[6],"have":[7],"been":[8],"becoming":[9],"more":[10,140,146],"natural":[11],"and":[12,61,80,117,129,150],"intelligible.":[13],"However,":[14],"the":[15,59,69,72,78,84,100,104,109,134,148,157,175,181],"evident":[16],"dropping":[17],"performance":[18],"still":[19],"exists":[20],"when":[21],"synthesizing":[22],"multi-speaker":[23],"speeches":[24,119],"in":[25],"zero-shot":[26,89],"manner,":[27],"especially":[28],"for":[29,87],"those":[30],"different":[32,35],"countries":[33],"with":[34,103,120],"accents.":[36],"To":[37,153],"bridge":[38],"this":[39],"gap,":[40],"we":[41,91],"propose":[42,92],"a":[43],"novel":[44],"method,":[45],"called":[46],"Voicifier.":[47],"It":[48],"firstly":[49],"operates":[50],"on":[51],"high":[52,115],"frequency":[53],"mel-spectrogram":[54],"bins":[55],"to":[56,71,75,113,138,161,166,172,179],"approximately":[57],"remove":[58],"content":[60,79,149],"rhythm.":[62],"Then":[63],"Voicifier":[64],"uses":[65],"two":[66],"strategies,":[67],"shallow":[70],"deep":[73],"mixing,":[74],"further":[76],"destroy":[77],"rhythm":[81],"but":[82],"retain":[83,139],"timbre.":[85],"Furthermore,":[86],"better":[88],"performance,":[90],"Voice-Pin":[93],"Layer":[94],"Normalization":[95],"(VPLN)":[96],"which":[97],"pins":[98],"down":[99],"timbre":[101,142],"according":[102],"text":[105],"feature.":[106],"During":[107],"inference,":[108],"model":[110],"is":[111,165],"allowed":[112],"synthesize":[114],"quality":[116],"similarity":[118],"just":[121],"around":[122],"1":[123],"sec":[124],"target":[125,141],"speech":[126],"audio.":[127],"Experiments":[128],"ablation":[130],"studies":[131],"prove":[132],"that":[133,164],"methods":[135,158],"are":[136,159],"able":[137],"while":[143],"abandoning":[144],"much":[145],"of":[147,174,183],"rhythm-related":[151],"information.":[152],"our":[154],"best":[155],"knowledge,":[156],"found":[160],"be":[162,170],"universal":[163],"say":[167],"it":[168],"can":[169],"applied":[171],"most":[173],"existing":[176],"TTS":[177],"systems":[178],"enhance":[180],"ability":[182],"cross-speaker":[184],"synthesis.":[185]},"counts_by_year":[],"updated_date":"2025-12-24T23:09:58.560324","created_date":"2025-10-10T00:00:00"}
