{"id":"https://openalex.org/W4386763495","doi":"https://doi.org/10.1109/waspaa58266.2023.10248117","title":"SLMGAN: Exploiting Speech Language Model Representations for Unsupervised Zero-Shot Voice Conversion in GANs","display_name":"SLMGAN: Exploiting Speech Language Model Representations for Unsupervised Zero-Shot Voice Conversion in GANs","publication_year":2023,"publication_date":"2023-09-15","ids":{"openalex":"https://openalex.org/W4386763495","doi":"https://doi.org/10.1109/waspaa58266.2023.10248117"},"language":"en","primary_location":{"id":"doi:10.1109/waspaa58266.2023.10248117","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/waspaa58266.2023.10248117","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023800090","display_name":"Yinghao Aaron Li","orcid":null},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yinghao Aaron Li","raw_affiliation_strings":["Columbia University,Department of Electrical Engineering,USA","Department of Electrical Engineering, Columbia University, USA"],"affiliations":[{"raw_affiliation_string":"Columbia University,Department of Electrical Engineering,USA","institution_ids":["https://openalex.org/I78577930"]},{"raw_affiliation_string":"Department of Electrical Engineering, Columbia University, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070114472","display_name":"Cong Han","orcid":"https://orcid.org/0000-0003-2121-000X"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Cong Han","raw_affiliation_strings":["Columbia University,Department of Electrical Engineering,USA","Department of Electrical Engineering, Columbia University, USA"],"affiliations":[{"raw_affiliation_string":"Columbia University,Department of Electrical Engineering,USA","institution_ids":["https://openalex.org/I78577930"]},{"raw_affiliation_string":"Department of Electrical Engineering, Columbia University, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033351155","display_name":"Nima Mesgarani","orcid":"https://orcid.org/0000-0002-2987-759X"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nima Mesgarani","raw_affiliation_strings":["Columbia University,Department of Electrical Engineering,USA","Department of Electrical Engineering, Columbia University, USA"],"affiliations":[{"raw_affiliation_string":"Columbia University,Department of Electrical Engineering,USA","institution_ids":["https://openalex.org/I78577930"]},{"raw_affiliation_string":"Department of Electrical Engineering, Columbia University, USA","institution_ids":["https://openalex.org/I78577930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5023800090"],"corresponding_institution_ids":["https://openalex.org/I78577930"],"apc_list":null,"apc_paid":null,"fwci":0.3497,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.65568968,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":"34","issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9866999983787537,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9861000180244446,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.8263857364654541},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8187086582183838},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6574574708938599},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6560831069946289},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6326000690460205},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.488543838262558},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.481065034866333},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.45245301723480225},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44741103053092957},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.43554067611694336},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4337748885154724},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.42836323380470276},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.34140321612358093},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32759541273117065}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.8263857364654541},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8187086582183838},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6574574708938599},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6560831069946289},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6326000690460205},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.488543838262558},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.481065034866333},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.45245301723480225},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44741103053092957},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.43554067611694336},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4337748885154724},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.42836323380470276},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.34140321612358093},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32759541273117065},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa58266.2023.10248117","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/waspaa58266.2023.10248117","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7200000286102295,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W2593414223","https://openalex.org/W2603777577","https://openalex.org/W2768282280","https://openalex.org/W2932319787","https://openalex.org/W2945478979","https://openalex.org/W2962780374","https://openalex.org/W2998572311","https://openalex.org/W3097952294","https://openalex.org/W3098557217","https://openalex.org/W3126283728","https://openalex.org/W3163475957","https://openalex.org/W3168719651","https://openalex.org/W3175491752","https://openalex.org/W3196667132","https://openalex.org/W3197659778","https://openalex.org/W3209984917","https://openalex.org/W4224916404","https://openalex.org/W4225329057","https://openalex.org/W4225680573","https://openalex.org/W4226380987","https://openalex.org/W4281736089","https://openalex.org/W4283659485","https://openalex.org/W4296068981","https://openalex.org/W4301371414","https://openalex.org/W4307077171","https://openalex.org/W4319862431","https://openalex.org/W4320451749","https://openalex.org/W4322629454","https://openalex.org/W4375869015","https://openalex.org/W4380714711","https://openalex.org/W4383221407","https://openalex.org/W6746052068","https://openalex.org/W6762533536","https://openalex.org/W6772349387","https://openalex.org/W6790622591","https://openalex.org/W6803547063","https://openalex.org/W6805710207","https://openalex.org/W6838843145","https://openalex.org/W6839738141","https://openalex.org/W6846445453","https://openalex.org/W6848417376","https://openalex.org/W6849600165","https://openalex.org/W6853937136"],"related_works":["https://openalex.org/W4391272374","https://openalex.org/W1914543332","https://openalex.org/W2946856121","https://openalex.org/W40885451","https://openalex.org/W2108985546","https://openalex.org/W2081919107","https://openalex.org/W2433276473","https://openalex.org/W1537411440","https://openalex.org/W1984347656","https://openalex.org/W596245619"],"abstract_inverted_index":{"In":[0],"recent":[1],"years,":[2],"large-scale":[3],"pre-trained":[4],"speech":[5,16,26,35,44],"language":[6],"models":[7,126],"(SLMs)":[8],"have":[9],"demonstrated":[10],"remarkable":[11],"advancements":[12],"in":[13,99,127],"various":[14],"generative":[15,63],"modeling":[17],"applications,":[18],"such":[19],"as":[20],"text-to-speech":[21],"synthesis,":[22],"voice":[23,70,103,124],"conversion,":[24],"and":[25,131],"enhancement.":[27],"These":[28],"applications":[29],"typically":[30],"involve":[31],"mapping":[32],"text":[33,110],"or":[34],"inputs":[36],"to":[37,54],"pretrained":[38],"SLM":[39,56,93],"representations,":[40],"from":[41],"which":[42],"target":[43],"is":[45],"decoded.":[46],"This":[47],"paper":[48],"introduces":[49],"a":[50],"new":[51],"approach,":[52],"SLMGAN,":[53],"leverage":[55],"representations":[57],"for":[58,69,141],"discriminative":[59],"tasks":[60],"within":[61],"the":[62,85,136],"adversarial":[64],"network":[65],"(GAN)":[66],"framework,":[67],"specifically":[68],"conversion.":[71],"Building":[72],"upon":[73],"StarGANv2-VC,":[74],"we":[75],"add":[76],"our":[77,90],"novel":[78],"SLM-based":[79,139],"WavLM":[80],"discriminators":[81,87,140],"on":[82],"top":[83],"of":[84,129,138],"mel-based":[86],"along":[88],"with":[89],"newly":[91],"designed":[92],"feature":[94],"matching":[95],"loss":[96],"function,":[97],"resulting":[98],"an":[100],"unsupervised":[101],"zero-shot":[102,123],"conversion":[104,125],"system":[105],"that":[106,118],"does":[107],"not":[108],"require":[109],"labels":[111],"during":[112],"training.":[113],"Subjective":[114],"evaluation":[115],"results":[116],"show":[117],"SLMGAN":[119],"outperforms":[120],"existing":[121],"state-of-the-art":[122],"terms":[128],"naturalness":[130],"achieves":[132],"comparable":[133],"similarity,":[134],"highlighting":[135],"potential":[137],"related":[142],"applications.":[143]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-22T23:10:17.713674","created_date":"2025-10-10T00:00:00"}
