{"id":"https://openalex.org/W7148585185","doi":"https://doi.org/10.1109/asru65441.2025.11434632","title":"REF-VC: Robust, Expressive and Fast Zero-Shot Voice Conversion with Diffusion Transformers","display_name":"REF-VC: Robust, Expressive and Fast Zero-Shot Voice Conversion with Diffusion Transformers","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148585185","doi":"https://doi.org/10.1109/asru65441.2025.11434632"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434632","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434632","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109786295","display_name":"Yuepeng Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I99418890","display_name":"Northwestern Polytechnic University","ror":"https://ror.org/05wn69s11","country_code":"US","type":"education","lineage":["https://openalex.org/I99418890"]}],"countries":["CN","US"],"is_corresponding":true,"raw_author_name":"Yuepeng Jiang","raw_affiliation_strings":["Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004","https://openalex.org/I99418890"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081164682","display_name":"Ziqian Ning","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I99418890","display_name":"Northwestern Polytechnic University","ror":"https://ror.org/05wn69s11","country_code":"US","type":"education","lineage":["https://openalex.org/I99418890"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Ziqian Ning","raw_affiliation_strings":["Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004","https://openalex.org/I99418890"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132829656","display_name":"Shuai Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I308837","display_name":"Suzhou University of Science and Technology","ror":"https://ror.org/04en8wb91","country_code":"CN","type":"education","lineage":["https://openalex.org/I308837"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Wang","raw_affiliation_strings":["Nanjing University Hong Kong,School of Intelligence Science and Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Nanjing University Hong Kong,School of Intelligence Science and Technology,Suzhou,China","institution_ids":["https://openalex.org/I308837"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082176355","display_name":"Chengjia Wang","orcid":"https://orcid.org/0000-0003-2345-7364"},"institutions":[{"id":"https://openalex.org/I4210091137","display_name":"NetEase (China)","ror":"https://ror.org/00fp6fj05","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091137"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengjia Wang","raw_affiliation_strings":["NetEase,Fuxi AI Lab,China"],"affiliations":[{"raw_affiliation_string":"NetEase,Fuxi AI Lab,China","institution_ids":["https://openalex.org/I4210091137"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036369578","display_name":"Mengxiao Bi","orcid":"https://orcid.org/0009-0007-6680-481X"},"institutions":[{"id":"https://openalex.org/I4210091137","display_name":"NetEase (China)","ror":"https://ror.org/00fp6fj05","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091137"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengxiao Bi","raw_affiliation_strings":["NetEase,Fuxi AI Lab,China"],"affiliations":[{"raw_affiliation_string":"NetEase,Fuxi AI Lab,China","institution_ids":["https://openalex.org/I4210091137"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132813845","display_name":"Pengcheng Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153393","display_name":"Geely (China)","ror":"https://ror.org/0446d5v35","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210153393"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengcheng Zhu","raw_affiliation_strings":["Geely,China"],"affiliations":[{"raw_affiliation_string":"Geely,China","institution_ids":["https://openalex.org/I4210153393"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132792893","display_name":"Zhonghua Fu","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I99418890","display_name":"Northwestern Polytechnic University","ror":"https://ror.org/05wn69s11","country_code":"US","type":"education","lineage":["https://openalex.org/I99418890"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Zhonghua Fu","raw_affiliation_strings":["Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004","https://openalex.org/I99418890"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132808273","display_name":"Lei Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I99418890","display_name":"Northwestern Polytechnic University","ror":"https://ror.org/05wn69s11","country_code":"US","type":"education","lineage":["https://openalex.org/I99418890"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio Speech and Language Processing Group (ASLP@NPU) School of Software,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004","https://openalex.org/I99418890"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5109786295"],"corresponding_institution_ids":["https://openalex.org/I17145004","https://openalex.org/I99418890"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87579944,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.871999979019165,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.871999979019165,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.031599998474121094,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.00839999970048666,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6912999749183655},{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.5530999898910522},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5508999824523926},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.4553999900817871},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4065999984741211},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.40119999647140503},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.3659000098705292}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7448999881744385},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6912999749183655},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6822999715805054},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.5530999898910522},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5508999824523926},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.4553999900817871},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4065999984741211},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.40119999647140503},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.3659000098705292},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.33059999346733093},{"id":"https://openalex.org/C2777185736","wikidata":"https://www.wikidata.org/wiki/Q7265603","display_name":"QUIET","level":2,"score":0.3160000145435333},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.2879999876022339},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.2687999904155731},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C24326235","wikidata":"https://www.wikidata.org/wiki/Q126095","display_name":"Electronic engineering","level":1,"score":0.2624000012874603},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25839999318122864},{"id":"https://openalex.org/C86781634","wikidata":"https://www.wikidata.org/wiki/Q2478325","display_name":"Environmental noise","level":3,"score":0.2558000087738037},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.25459998846054077},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434632","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434632","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1992475611","https://openalex.org/W2518172956","https://openalex.org/W3083423753","https://openalex.org/W3135654121","https://openalex.org/W3197763626","https://openalex.org/W3203407300","https://openalex.org/W3206503703","https://openalex.org/W3209984917","https://openalex.org/W4205582447","https://openalex.org/W4221167707","https://openalex.org/W4372338328","https://openalex.org/W4385823126","https://openalex.org/W4390815727","https://openalex.org/W4390872297","https://openalex.org/W4391021798","https://openalex.org/W4402112067","https://openalex.org/W4406107190","https://openalex.org/W4406461672","https://openalex.org/W4406461681","https://openalex.org/W4409363098","https://openalex.org/W4412945617"],"related_works":[],"abstract_inverted_index":{"In":[0,127],"real-world":[1],"voice":[2,49,135],"conversion":[3,50,136],"applications,":[4],"environmental":[5],"noise":[6,23,40,70],"in":[7,66,110],"source":[8],"speech":[9],"and":[10,39,72],"user":[11],"demands":[12],"for":[13],"expressive":[14,48],"output":[15],"pose":[16],"critical":[17],"challenges.":[18],"Traditional":[19],"ASR-based":[20],"methods":[21],"ensure":[22],"robustness":[24,71],"but":[25,34],"suppress":[26,81],"prosody":[27],"richness,":[28],"while":[29,117],"SSL-based":[30],"models":[31],"improve":[32],"expressiveness":[33],"suffer":[35],"from":[36],"timbre":[37],"leakage":[38],"sensitivity.":[41],"This":[42],"paper":[43],"proposes":[44],"REF-VC,":[45],"a":[46],"noise-robust":[47],"system.":[51],"Key":[52],"innovations":[53],"include:":[54],"(1)":[55],"A":[56],"random":[57],"erasing":[58],"strategy":[59],"to":[60,80,90,97,121],"mitigate":[61],"the":[62,114,124],"information":[63],"redundancy":[64],"inherent":[65],"SSL":[67],"features,":[68],"enhancing":[69],"expressiveness;":[73],"(2)":[74],"Implicit":[75],"alignment":[76],"inspired":[77],"by":[78],"E2TTS":[79],"non-essential":[82],"feature":[83],"reconstruction;":[84],"(3)":[85],"Integration":[86],"of":[87],"Shortcut":[88],"Models":[89],"accelerate":[91],"flow":[92],"matching":[93],"inference,":[94],"significantly":[95],"reducing":[96],"4":[98],"steps.":[99],"Experimental":[100],"results":[101],"demonstrate":[102],"that":[103],"REF-VC":[104,129],"outperforms":[105],"baselines":[106],"such":[107],"as":[108],"Seed-VC":[109,122],"zero-shot":[111],"scenarios":[112],"on":[113,123],"noisy":[115],"set,":[116],"also":[118],"performing":[119],"comparably":[120],"clean":[125],"set.":[126],"addition,":[128],"can":[130,142],"be":[131,143],"compatible":[132],"with":[133],"singing":[134],"within":[137],"one":[138],"model.":[139],"The":[140],"samples":[141],"found":[144],"at:":[145],"https://rxyj.github.io/asru2025/":[146]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
