{"id":"https://openalex.org/W4399168655","doi":"https://doi.org/10.1109/taslp.2024.3407577","title":"Multi-Level Temporal-Channel Speaker Retrieval for Zero-Shot Voice Conversion","display_name":"Multi-Level Temporal-Channel Speaker Retrieval for Zero-Shot Voice Conversion","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4399168655","doi":"https://doi.org/10.1109/taslp.2024.3407577"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3407577","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3407577","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106698700","display_name":"Zhichao Wang","orcid":"https://orcid.org/0000-0001-8075-1784"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhichao Wang","raw_affiliation_strings":["ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009337933","display_name":"Liumeng Xue","orcid":"https://orcid.org/0000-0003-2815-8494"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liumeng Xue","raw_affiliation_strings":["ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072482416","display_name":"Qiuqiang Kong","orcid":"https://orcid.org/0000-0003-2864-0475"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiuqiang Kong","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055175414","display_name":"Yuanzhe Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuanzhe Chen","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103162279","display_name":"Qiao Tian","orcid":"https://orcid.org/0000-0002-4078-1273"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao Tian","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100339106","display_name":"Yu\u2010Ping Wang","orcid":"https://orcid.org/0000-0001-9340-5864"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuping Wang","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5106698700"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":1.0878,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.8003296,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":96},"biblio":{"volume":"32","issue":null,"first_page":"2926","last_page":"2937"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5523381233215332},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.4880748987197876},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.44494205713272095},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43690919876098633},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.435764342546463},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4014052450656891},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13816407322883606},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.10597163438796997}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5523381233215332},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.4880748987197876},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.44494205713272095},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43690919876098633},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.435764342546463},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4014052450656891},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13816407322883606},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.10597163438796997},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3407577","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3407577","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.4099999964237213}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1509691205","https://openalex.org/W1901129140","https://openalex.org/W2120605154","https://openalex.org/W2126143605","https://openalex.org/W2156142001","https://openalex.org/W2518172956","https://openalex.org/W2532494225","https://openalex.org/W2752782242","https://openalex.org/W2808631503","https://openalex.org/W2900348361","https://openalex.org/W2951454267","https://openalex.org/W2962896155","https://openalex.org/W2963539064","https://openalex.org/W2963609956","https://openalex.org/W2972359262","https://openalex.org/W2972659941","https://openalex.org/W3024869864","https://openalex.org/W3096524539","https://openalex.org/W3096558818","https://openalex.org/W3096609285","https://openalex.org/W3109064156","https://openalex.org/W3161627112","https://openalex.org/W3162390194","https://openalex.org/W3162512456","https://openalex.org/W3163475957","https://openalex.org/W3163573274","https://openalex.org/W3163654539","https://openalex.org/W3168542456","https://openalex.org/W3196616101","https://openalex.org/W3197659778","https://openalex.org/W3197763626","https://openalex.org/W3197943112","https://openalex.org/W3198020407","https://openalex.org/W3198082505","https://openalex.org/W3200756692","https://openalex.org/W3202267900","https://openalex.org/W3213785244","https://openalex.org/W4214727094","https://openalex.org/W4221141917","https://openalex.org/W4221154746","https://openalex.org/W4283731195","https://openalex.org/W4283832640","https://openalex.org/W4296068587","https://openalex.org/W4296068991","https://openalex.org/W4375869253","https://openalex.org/W4385245566","https://openalex.org/W4386536200","https://openalex.org/W6603838645","https://openalex.org/W6682137061","https://openalex.org/W6749555683","https://openalex.org/W6762533536","https://openalex.org/W6776390925","https://openalex.org/W6786325012","https://openalex.org/W6802029025","https://openalex.org/W6803547063","https://openalex.org/W6810447587","https://openalex.org/W6936113694"],"related_works":["https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W2038083449","https://openalex.org/W3177678247","https://openalex.org/W1999617572","https://openalex.org/W2944572343","https://openalex.org/W2333799855","https://openalex.org/W2351687372","https://openalex.org/W2004087835","https://openalex.org/W2314871050"],"abstract_inverted_index":{"Zero-shot":[0],"voice":[1,9],"conversion":[2],"(VC)":[3],"converts":[4],"source":[5],"speech":[6,228,260,298],"into":[7],"the":[8,19,56,75,78,90,141,146,152,213,223,230,286],"of":[10,18,58,68,77,151,215,226],"any":[11],"desired":[12],"speaker":[13,20,30,35,41,52,59,72,143,159,196,204,232,240,293],"using":[14],"only":[15],"one":[16],"utterance":[17],"without":[21],"requiring":[22],"additional":[23],"model":[24,38,80,102],"updates.":[25],"Typical":[26],"methods":[27,54,290],"use":[28],"a":[29,33,98,156,216,248],"representation":[31,42,205],"from":[32,206,242],"pre-trained":[34,217],"verification":[36],"(SV)":[37],"or":[39],"learn":[40],"during":[43],"VC":[44,79,101,289],"training":[45,91,250],"to":[46,81,133,137,140,186,238,252,257,285],"achieve":[47,258],"zero-shot":[48,100,254,288],"VC.":[49],"However,":[50],"existing":[51],"modeling":[53,73,160,292],"overlook":[55],"variation":[57],"information":[60,197],"richness":[61],"in":[62,89,145,199,291],"temporal":[63,147,208],"and":[64,148,192,209,262,277],"frequency":[65],"channel":[66,149,210],"dimensions":[67,211],"speech.":[69,200],"This":[70],"insufficient":[71],"hampers":[74],"ability":[76],"accurately":[82],"represent":[83],"unseen":[84],"speakers":[85],"who":[86],"are":[87],"not":[88],"dataset.":[92],"In":[93],"this":[94,266],"study,":[95],"we":[96,154,246,268],"present":[97],"robust":[99],"with":[103],"<italic":[104,110,116,122,128,163,169,175,181,189,193],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[105,108,111,114,117,120,123,126,129,164,167,170,173,176,179,182,190,194],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"/>":[106,165],"<bold":[107,113,119,125,166,172,178],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">m</b>":[109],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">ulti-level</i>":[112],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">t</b>":[115,168],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">emporal-</i>":[118,171],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">c</b>":[121,174],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">hannel</i>":[124,177],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">r</b>":[127,180],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">etrieval</i>":[130],",":[131,185],"referred":[132],"as":[134],"MTCR-VC.":[135],"Specifically,":[136],"flexibly":[138],"adapt":[139],"dynamic-variant":[142],"characteristic":[144],"axis":[150],"speech,":[153],"propose":[155],"novel":[157],"fine-grained":[158],"method,":[161],"called":[162],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">etrieval":[183],"(TCR)</i>":[184],"find":[187],"out":[188],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">when</i>":[191],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">where</i>":[195],"appears":[198],"It":[201],"retrieves":[202],"variable-length":[203],"both":[207],"under":[212],"guidance":[214],"SV":[218],"model.":[219],"Besides,":[220],"inspired":[221],"by":[222],"hierarchical":[224],"process":[225],"human":[227],"production,":[229],"MTCR":[231],"module":[233],"stacks":[234],"several":[235],"TCR":[236],"blocks":[237],"extract":[239],"representations":[241],"multi-granularity":[243],"levels.":[244],"Furthermore,":[245],"introduce":[247],"cycle-based":[249],"strategy":[251],"simulate":[253],"inference":[255],"recurrently":[256],"better":[259],"disentanglement":[261],"reconstruction.":[263],"To":[264],"drive":[265],"process,":[267],"adopt":[269],"perceptual":[270],"constraints":[271],"on":[272],"three":[273],"aspects:":[274],"content,":[275],"style,":[276],"speaker.":[278],"Experiments":[279],"demonstrate":[280],"that":[281],"MTCR-VC":[282],"is":[283],"superior":[284],"previous":[287],"timbre":[294],"while":[295],"maintaining":[296],"good":[297],"naturalness.":[299]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
