{"id":"https://openalex.org/W4392910523","doi":"https://doi.org/10.1109/icassp48485.2024.10446229","title":"Dualvc 2: Dynamic Masked Convolution for Unified Streaming and Non-Streaming Voice Conversion","display_name":"Dualvc 2: Dynamic Masked Convolution for Unified Streaming and Non-Streaming Voice Conversion","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392910523","doi":"https://doi.org/10.1109/icassp48485.2024.10446229"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446229","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446229","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081164682","display_name":"Ziqian Ning","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I4210091137","display_name":"NetEase (China)","ror":"https://ror.org/00fp6fj05","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091137"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziqian Ning","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","Fuxi AI Lab, NetEase Inc., Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]},{"raw_affiliation_string":"Fuxi AI Lab, NetEase Inc., Hangzhou, China","institution_ids":["https://openalex.org/I4210091137"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018041731","display_name":"Yuepeng Jiang","orcid":"https://orcid.org/0000-0002-1444-7183"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuepeng Jiang","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050166453","display_name":"Pengcheng Zhu","orcid":"https://orcid.org/0000-0001-9867-7041"},"institutions":[{"id":"https://openalex.org/I4210091137","display_name":"NetEase (China)","ror":"https://ror.org/00fp6fj05","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091137"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengcheng Zhu","raw_affiliation_strings":["NetEase Inc.,Fuxi AI Lab,Hangzhou,China","Fuxi AI Lab, NetEase Inc., Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NetEase Inc.,Fuxi AI Lab,Hangzhou,China","institution_ids":["https://openalex.org/I4210091137"]},{"raw_affiliation_string":"Fuxi AI Lab, NetEase Inc., Hangzhou, China","institution_ids":["https://openalex.org/I4210091137"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100328340","display_name":"Shuai Wang","orcid":"https://orcid.org/0000-0003-1523-9631"},"institutions":[{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Wang","raw_affiliation_strings":["The Chinese University of Hong Kong,Shenzhen Research Institute of Big Data,Shenzhen (CUHK-Shenzhen),China","Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong,Shenzhen Research Institute of Big Data,Shenzhen (CUHK-Shenzhen),China","institution_ids":["https://openalex.org/I4210116924"]},{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China","institution_ids":["https://openalex.org/I4210116924","https://openalex.org/I4210099586"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015560758","display_name":"Jixun Yao","orcid":"https://orcid.org/0000-0002-5324-7360"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jixun Yao","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036369578","display_name":"Mengxiao Bi","orcid":"https://orcid.org/0009-0007-6680-481X"},"institutions":[{"id":"https://openalex.org/I4210091137","display_name":"NetEase (China)","ror":"https://ror.org/00fp6fj05","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091137"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengxiao Bi","raw_affiliation_strings":["NetEase Inc.,Fuxi AI Lab,Hangzhou,China","Fuxi AI Lab, NetEase Inc., Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NetEase Inc.,Fuxi AI Lab,Hangzhou,China","institution_ids":["https://openalex.org/I4210091137"]},{"raw_affiliation_string":"Fuxi AI Lab, NetEase Inc., Hangzhou, China","institution_ids":["https://openalex.org/I4210091137"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.5273,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.8448049,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"11106","last_page":"11110"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9815000295639038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9815000295639038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9039000272750854,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7934234142303467},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.6481680274009705},{"id":"https://openalex.org/keywords/streaming-current","display_name":"Streaming current","score":0.6403868198394775},{"id":"https://openalex.org/keywords/video-streaming","display_name":"Video streaming","score":0.5606786012649536},{"id":"https://openalex.org/keywords/live-streaming","display_name":"Live streaming","score":0.5051680207252502},{"id":"https://openalex.org/keywords/acoustic-streaming","display_name":"Acoustic streaming","score":0.4488431215286255},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32486698031425476},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.29917097091674805},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.13894128799438477},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.08407562971115112},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.07527098059654236},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.05474159121513367}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7934234142303467},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.6481680274009705},{"id":"https://openalex.org/C30311675","wikidata":"https://www.wikidata.org/wiki/Q7622689","display_name":"Streaming current","level":3,"score":0.6403868198394775},{"id":"https://openalex.org/C2986160907","wikidata":"https://www.wikidata.org/wiki/Q220499","display_name":"Video streaming","level":2,"score":0.5606786012649536},{"id":"https://openalex.org/C2776741261","wikidata":"https://www.wikidata.org/wiki/Q3027665","display_name":"Live streaming","level":2,"score":0.5051680207252502},{"id":"https://openalex.org/C2779019082","wikidata":"https://www.wikidata.org/wiki/Q4674713","display_name":"Acoustic streaming","level":3,"score":0.4488431215286255},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32486698031425476},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.29917097091674805},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.13894128799438477},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.08407562971115112},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.07527098059654236},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.05474159121513367},{"id":"https://openalex.org/C81288441","wikidata":"https://www.wikidata.org/wiki/Q20736125","display_name":"Ultrasonic sensor","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C147789679","wikidata":"https://www.wikidata.org/wiki/Q11372","display_name":"Physical chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C27703432","wikidata":"https://www.wikidata.org/wiki/Q2778467","display_name":"Electrokinetic phenomena","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446229","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446229","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W2219249508","https://openalex.org/W2842511635","https://openalex.org/W2972943112","https://openalex.org/W2996414377","https://openalex.org/W3096159803","https://openalex.org/W3097777922","https://openalex.org/W3098557217","https://openalex.org/W3101689408","https://openalex.org/W3156592906","https://openalex.org/W3197478142","https://openalex.org/W3197659778","https://openalex.org/W3197943112","https://openalex.org/W3198533616","https://openalex.org/W3203407300","https://openalex.org/W4221152438","https://openalex.org/W4296068989","https://openalex.org/W4297437784","https://openalex.org/W4297808394","https://openalex.org/W4372260214","https://openalex.org/W4372266901","https://openalex.org/W4372267354","https://openalex.org/W4372337821","https://openalex.org/W4372340947","https://openalex.org/W4372346432","https://openalex.org/W4375869015","https://openalex.org/W4375869292","https://openalex.org/W4385245566","https://openalex.org/W4385823126","https://openalex.org/W4385823331","https://openalex.org/W6688816777","https://openalex.org/W6793624898","https://openalex.org/W6843253040","https://openalex.org/W6846288163","https://openalex.org/W6846769693","https://openalex.org/W6847037835","https://openalex.org/W6847340798"],"related_works":["https://openalex.org/W2598100514","https://openalex.org/W3169846091","https://openalex.org/W3214935004","https://openalex.org/W4328029402","https://openalex.org/W2389522113","https://openalex.org/W2330360017","https://openalex.org/W2089027649","https://openalex.org/W4390321210","https://openalex.org/W4287801741","https://openalex.org/W4235833544"],"abstract_inverted_index":{"Voice":[0],"conversion":[1],"is":[2,94,124,135,156],"becoming":[3],"increasingly":[4],"popular,":[5],"and":[6,33,69,171,178],"a":[7,127,141],"growing":[8],"number":[9],"of":[10,47,149],"application":[11],"scenarios":[12],"require":[13],"models":[14],"with":[15,38,140,181],"streaming":[16,29,81],"inference":[17,72],"capabilities.":[18],"The":[19],"recently":[20],"proposed":[21],"DualVC":[22,51,114,167,170],"attempts":[23],"to":[24,42,96,116,126,145,158],"achieve":[25],"this":[26,110],"objective":[27,179],"through":[28],"model":[30,93,122],"architecture":[31],"design":[32],"intra-model":[34],"knowledge":[35],"distillation":[36],"along":[37],"hybrid":[39],"predictive":[40],"coding":[41],"compensate":[43],"for":[44],"the":[45,60,71,77,92,99,102,106,121,160],"lack":[46],"future":[48,87,151],"information.":[49,152],"However,":[50],"encounters":[52],"several":[53],"problems":[54],"that":[55,166],"limit":[56],"its":[57,67],"performance.":[58],"First,":[59],"autoregressive":[61],"decoder":[62],"has":[63],"error":[64],"accumulation":[65],"in":[66,101,175],"nature":[68],"limits":[70],"speed":[73],"as":[74],"well.":[75],"Second,":[76],"causal":[78],"convolution":[79,134,139],"enables":[80],"capability":[82],"but":[83],"cannot":[84],"sufficiently":[85],"use":[86,148],"information":[88],"within":[89],"chunks.":[90],"Third,":[91],"unable":[95],"effectively":[97],"address":[98,117],"noise":[100,162],"unvoiced":[103],"segments,":[104],"lowering":[105],"sound":[107],"quality.":[108],"In":[109],"paper,":[111],"we":[112],"propose":[113],"2":[115,168],"these":[118],"issues.":[119],"Specifically,":[120],"backbone":[123],"migrated":[125],"Conformer-based":[128],"architecture,":[129],"empowering":[130],"parallel":[131],"inference.":[132],"Causal":[133],"replaced":[136],"by":[137],"non-causal":[138],"dynamic":[142],"chunk":[143],"mask":[144],"make":[146],"better":[147],"within-chunk":[150],"Also,":[153],"quiet":[154],"attention":[155],"introduced":[157],"enhance":[159],"model\u2019s":[161],"robustness.":[163],"Experiments":[164],"show":[165],"outperforms":[169],"other":[172],"baseline":[173],"systems":[174],"both":[176],"subjective":[177],"metrics,":[180],"only":[182],"186.4":[183],"ms":[184],"latency.":[185],"Our":[186],"audio":[187],"samples":[188],"are":[189],"made":[190],"publicly":[191],"available":[192],"<sup":[193],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[194],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[195],".":[196]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
