{"id":"https://openalex.org/W4386132131","doi":"https://doi.org/10.1109/lsp.2023.3308474","title":"LM-VC: Zero-Shot Voice Conversion via Speech Generation Based on Language Models","display_name":"LM-VC: Zero-Shot Voice Conversion via Speech Generation Based on Language Models","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4386132131","doi":"https://doi.org/10.1109/lsp.2023.3308474"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2023.3308474","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2023.3308474","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106698700","display_name":"Zhichao Wang","orcid":"https://orcid.org/0000-0001-8075-1784"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhichao Wang","raw_affiliation_strings":["ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055175414","display_name":"Yuanzhe Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuanzhe Chen","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"ASLP Lab, School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018710096","display_name":"Qiao Tian","orcid":"https://orcid.org/0000-0001-8177-7724"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao Tian","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100339106","display_name":"Yu\u2010Ping Wang","orcid":"https://orcid.org/0000-0001-9340-5864"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuping Wang","raw_affiliation_strings":["ByteDance SAMI Group, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance SAMI Group, Shanghai, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5106698700"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":3.6497,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.94409505,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"30","issue":null,"first_page":"1157","last_page":"1161"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8143818378448486},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6690618991851807},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44885537028312683},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.42603519558906555},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.4190725088119507},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39393725991249084},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3933229148387909},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3764346241950989},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.3402567505836487}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8143818378448486},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6690618991851807},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44885537028312683},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.42603519558906555},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.4190725088119507},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39393725991249084},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3933229148387909},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3764346241950989},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3402567505836487}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2023.3308474","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2023.3308474","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4099999964237213,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":50,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1572989473","https://openalex.org/W1915251500","https://openalex.org/W2091318760","https://openalex.org/W2518172956","https://openalex.org/W2972359262","https://openalex.org/W2972659941","https://openalex.org/W3024869864","https://openalex.org/W3138516171","https://openalex.org/W3152740956","https://openalex.org/W3154308281","https://openalex.org/W3161627112","https://openalex.org/W3162390194","https://openalex.org/W3197358873","https://openalex.org/W3197659778","https://openalex.org/W3198082505","https://openalex.org/W3202267900","https://openalex.org/W3209059054","https://openalex.org/W3215615641","https://openalex.org/W4221141917","https://openalex.org/W4285294723","https://openalex.org/W4296069138","https://openalex.org/W4297808394","https://openalex.org/W4313679638","https://openalex.org/W4318351475","https://openalex.org/W4376632433","https://openalex.org/W4376632463","https://openalex.org/W4381786045","https://openalex.org/W4384648639","https://openalex.org/W4385245566","https://openalex.org/W4390075359","https://openalex.org/W6603838645","https://openalex.org/W6634186343","https://openalex.org/W6640059789","https://openalex.org/W6762122294","https://openalex.org/W6762533536","https://openalex.org/W6776390925","https://openalex.org/W6780218876","https://openalex.org/W6794185780","https://openalex.org/W6802029025","https://openalex.org/W6803547063","https://openalex.org/W6805710207","https://openalex.org/W6810447587","https://openalex.org/W6844194202","https://openalex.org/W6848735303","https://openalex.org/W6849105126","https://openalex.org/W6852503157","https://openalex.org/W6852638271","https://openalex.org/W6854308872","https://openalex.org/W6936113694"],"related_works":["https://openalex.org/W2164147372","https://openalex.org/W4253660971","https://openalex.org/W2550171623","https://openalex.org/W596245619","https://openalex.org/W642007152","https://openalex.org/W2131711534","https://openalex.org/W2184127972","https://openalex.org/W2343205865","https://openalex.org/W2341426843","https://openalex.org/W2009814707"],"abstract_inverted_index":{"Language":[0],"model":[1,220],"(LM)":[2],"based":[3,236],"audio":[4,17],"generation":[5,131],"frameworks,":[6],"e.g.,":[7],"AudioLM,":[8],"have":[9],"recently":[10],"achieved":[11],"new":[12],"state-of-the-art":[13],"performance":[14],"in":[15,84,98,133,253,273,294,305],"zero-shot":[16],"generation.":[18],"In":[19],"this":[20],"paper,":[21],"we":[22,158],"explore":[23],"the":[24,69,80,94,99,109,121,125,130,134,137,176,187,225,229,238,250,254,265,274,290,295],"feasibility":[25],"of":[26,68,136],"LMs":[27],"for":[28,174,189,215],"<italic":[29,160],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[30,161],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">zero-shot":[31],"voice":[32,100],"conversion</i>":[33],".":[34],"An":[35],"intuitive":[36],"approach":[37,75,168],"is":[38,213,221,269],"to":[39,60,120,141,147,196,223,247,263,271],"follow":[40],"AudioLM":[41],"\u2013":[42],"Tokenizing":[43],"speech":[44,96,151,235,306],"into":[45],"semantic":[46,58,85,110,244],"and":[47,53,55,150,180,184,200,232,242,292,308],"acoustic":[48,62,66,172,190,217,267,276,287],"tokens":[49,59,63,67,86,111,173,288],"respectively":[50],"by":[51],"HuBERT":[52],"SoundStream,":[54],"converting":[56],"source":[57,177],"target":[61,70,122,126,181,234,239],"conditioned":[64],"on":[65,237],"speaker.":[71],"However,":[72],"such":[73],"an":[74,256],"encounters":[76],"several":[77],"issues:":[78],"1)":[79],"linguistic":[81,178],"content":[82,179,198,227],"contained":[83],"may":[87,117],"get":[88],"dispersed":[89],"during":[90,144],"multi-layer":[91],"modeling":[92,167,277],"while":[93],"lengthy":[95],"input":[97],"conversion":[101],"task":[102],"makes":[103],"contextual":[104],"learning":[105],"even":[106],"harder;":[107],"2)":[108],"still":[112],"contain":[113],"speaker-related":[114],"information,":[115],"which":[116,259],"be":[118],"leaked":[119],"speech,":[123],"lowering":[124],"speaker":[127,309],"similarity;":[128],"3)":[129],"diversity":[132],"sampling":[135,251],"LM":[138,207,284],"can":[139],"lead":[140],"unexpected":[142],"outcomes":[143],"inference,":[145],"leading":[146],"unnatural":[148],"pronunciation":[149],"quality":[152],"degradation.":[153],"To":[154],"mitigate":[155],"these":[156],"problems,":[157],"propose":[159],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">LM-VC</i>":[162],",":[163],"a":[164,204,209,282],"two-stage":[165],"language":[166],"that":[169,300],"generates":[170],"coarse":[171,216,275,291],"recovering":[175],"speaker's":[182,240],"timbre,":[183],"then":[185],"reconstructs":[186,285],"fine":[188,286],"details":[191],"as":[192],"converted":[193,296],"speech.":[194,297],"Specifically,":[195],"enhance":[197],"preservation":[199],"facilitates":[201],"better":[202],"disentanglement,":[203],"masked":[205,226],"prefix":[206,283],"with":[208],"mask":[210],"prediction":[211],"strategy":[212],"used":[214],"modeling.":[218],"This":[219],"encouraged":[222],"recover":[224],"from":[228,289],"surrounding":[230],"context":[231],"generate":[233],"utterance":[241],"corrupted":[243],"tokens.":[245],"Besides,":[246],"further":[248],"alleviate":[249],"error":[252],"generation,":[255],"external":[257],"LM,":[258],"employs":[260],"window":[261],"attention":[262],"capture":[264],"local":[266],"relations,":[268],"introduced":[270],"participate":[272],"through":[278],"shallow":[279],"fusion.":[280],"Finally,":[281],"results":[293],"Experiments":[298],"demonstrate":[299],"LM-VC":[301],"outperforms":[302],"competitive":[303],"systems":[304],"naturalness":[307],"similarity.":[310]},"counts_by_year":[{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":11}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
