{"id":"https://openalex.org/W4392969497","doi":"https://doi.org/10.1109/icassp48485.2024.10447984","title":"Wav2vec-VC: Voice Conversion via Hidden Representations of Wav2vec 2.0","display_name":"Wav2vec-VC: Voice Conversion via Hidden Representations of Wav2vec 2.0","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392969497","doi":"https://doi.org/10.1109/icassp48485.2024.10447984"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447984","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447984","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114137695","display_name":"Jaemin Lim","orcid":null},"institutions":[{"id":"https://openalex.org/I4210160331","display_name":"Neoleukin Therapeutics (United States)","ror":"https://ror.org/04r0bh641","country_code":"US","type":"company","lineage":["https://openalex.org/I4210160331"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jaemin Lim","raw_affiliation_strings":["PrairieSchooner Inc,Republic of Korea","PrairieSchooner Inc, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"PrairieSchooner Inc,Republic of Korea","institution_ids":["https://openalex.org/I4210160331"]},{"raw_affiliation_string":"PrairieSchooner Inc, Republic of Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074323842","display_name":"Kiyeon Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I4210160331","display_name":"Neoleukin Therapeutics (United States)","ror":"https://ror.org/04r0bh641","country_code":"US","type":"company","lineage":["https://openalex.org/I4210160331"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kiyeon Kim","raw_affiliation_strings":["PrairieSchooner Inc,Republic of Korea","PrairieSchooner Inc, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"PrairieSchooner Inc,Republic of Korea","institution_ids":["https://openalex.org/I4210160331"]},{"raw_affiliation_string":"PrairieSchooner Inc, Republic of Korea","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.582,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.95135683,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"10326","last_page":"10330"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.7964160442352295},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.744113028049469},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.741463303565979},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6209938526153564},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5677769780158997},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5414294600486755},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.46993929147720337},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.42377960681915283},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.36656317114830017},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3539581894874573}],"concepts":[{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.7964160442352295},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.744113028049469},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.741463303565979},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6209938526153564},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5677769780158997},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5414294600486755},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.46993929147720337},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.42377960681915283},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36656317114830017},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3539581894874573},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447984","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447984","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W2603777577","https://openalex.org/W2842511635","https://openalex.org/W2954386831","https://openalex.org/W2972359262","https://openalex.org/W2972659941","https://openalex.org/W2990124956","https://openalex.org/W3015434413","https://openalex.org/W3016011332","https://openalex.org/W3036601975","https://openalex.org/W3096524539","https://openalex.org/W3121914243","https://openalex.org/W3161627112","https://openalex.org/W3163475957","https://openalex.org/W3168719651","https://openalex.org/W3197580070","https://openalex.org/W3197659778","https://openalex.org/W3197763626","https://openalex.org/W3198275944","https://openalex.org/W3207558756","https://openalex.org/W3209059054","https://openalex.org/W4221146610","https://openalex.org/W4226380987","https://openalex.org/W4287173589","https://openalex.org/W4297808394","https://openalex.org/W6603838645","https://openalex.org/W6762533536","https://openalex.org/W6764876209","https://openalex.org/W6770838157","https://openalex.org/W6780218876","https://openalex.org/W6787335539","https://openalex.org/W6788328058","https://openalex.org/W6795952400"],"related_works":["https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W2038083449","https://openalex.org/W3177678247","https://openalex.org/W1999617572","https://openalex.org/W2944572343","https://openalex.org/W2333799855","https://openalex.org/W2351687372","https://openalex.org/W2127461790","https://openalex.org/W2069324367"],"abstract_inverted_index":{"This":[0],"paper":[1],"describes":[2],"an":[3,98],"unconventional":[4],"way":[5],"to":[6,87,105,114,126],"use":[7],"wav2vec":[8,25,81],"2.0":[9,26,82],"representations":[10,23,45,84],"for":[11,68],"voice":[12,144],"conversion":[13],"(VC)":[14],"purpose.":[15],"Our":[16,131],"experiment":[17],"shows":[18,133],"that":[19,79,134],"aggregate":[20,41],"of":[21,42,142],"hidden":[22,44,100],"from":[24],"layers":[27],"is":[28,46,121],"more":[29],"effective":[30],"in":[31,103,140],"VC":[32,138],"than":[33],"using":[34],"last-layer":[35],"representation":[36,101],"only.":[37],"In":[38],"particular,":[39],"the":[40,69,88,107],"all":[43],"dependent":[47],"on":[48,65,73],"a":[49,58,92,115,124,128],"mainly":[50],"required":[51],"characteristic":[52],"(e.g.":[53],"vocal":[54],"or":[55],"linguistic)":[56],"by":[57],"speech":[59,147],"task\u2014but":[60],"such":[61],"results":[62],"are":[63],"consistent":[64],"different":[66],"datasets":[67],"same":[70],"task.":[71],"Based":[72],"these":[74],"results,":[75],"we":[76],"propose":[77],"Wav2vec-VC":[78,96,135],"uses":[80],"hidden-layer":[83],"as":[85],"input":[86],"disentanglement-based":[89],"VC.":[90],"Given":[91],"target":[93],"(/source)":[94],"utterance,":[95],"gets":[97],"aggregated":[99],"weighted":[102],"order":[104],"perform":[106],"speaker":[108,116],"(/content)-related":[109],"tasks,":[110],"and":[111,146],"feeds":[112],"it":[113],"(/content)":[117],"encoder":[118],"whose":[119],"output":[120],"combined":[122],"at":[123],"decoder":[125],"synthesize":[127],"voice-converted":[129],"utterance.":[130],"evaluation":[132],"outperforms":[136],"SOTA":[137],"models":[139],"terms":[141],"both":[143],"similarity":[145],"intelligibility.":[148]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":7}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
