{"id":"https://openalex.org/W4361791061","doi":"https://doi.org/10.1145/3577530.3577575","title":"High Quality and Similarity One-Shot Voice Conversion Using End-to-End Model","display_name":"High Quality and Similarity One-Shot Voice Conversion Using End-to-End Model","publication_year":2022,"publication_date":"2022-12-09","ids":{"openalex":"https://openalex.org/W4361791061","doi":"https://doi.org/10.1145/3577530.3577575"},"language":"en","primary_location":{"id":"doi:10.1145/3577530.3577575","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3577530.3577575","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 6th International Conference on Computer Science and Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090584400","display_name":"Renmingyue Du","orcid":null},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Renmingyue Du","raw_affiliation_strings":["School of Advanced Technology, Xi'an Jiaotong-liverpool University, China"],"raw_orcid":"https://orcid.org/0000-0002-3116-1961","affiliations":[{"raw_affiliation_string":"School of Advanced Technology, Xi'an Jiaotong-liverpool University, China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015560758","display_name":"Jixun Yao","orcid":"https://orcid.org/0000-0002-5324-7360"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jixun Yao","raw_affiliation_strings":["School of Computer, Northwestern Polytechnical University, China"],"raw_orcid":"https://orcid.org/0000-0002-5324-7360","affiliations":[{"raw_affiliation_string":"School of Computer, Northwestern Polytechnical University, China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18274112,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"284","last_page":"288"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9932000041007996,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8125315308570862},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6961338520050049},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.688248872756958},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6493445634841919},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5953011512756348},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5445525646209717},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5231344103813171},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4813550114631653},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.47533750534057617},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4520185589790344},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.44201499223709106},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.43028295040130615},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3809557259082794},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.061666637659072876}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8125315308570862},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6961338520050049},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.688248872756958},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6493445634841919},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5953011512756348},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5445525646209717},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5231344103813171},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4813550114631653},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.47533750534057617},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4520185589790344},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.44201499223709106},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.43028295040130615},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3809557259082794},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.061666637659072876},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3577530.3577575","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3577530.3577575","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 6th International Conference on Computer Science and Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.5799999833106995,"id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W2747744257","https://openalex.org/W2902070858","https://openalex.org/W2946555236","https://openalex.org/W2962788625","https://openalex.org/W2972659941","https://openalex.org/W2972667718","https://openalex.org/W2973032144","https://openalex.org/W3012437242","https://openalex.org/W3015434413","https://openalex.org/W3015805741","https://openalex.org/W3092028330","https://openalex.org/W3098557217","https://openalex.org/W3168719651","https://openalex.org/W3203407300"],"related_works":["https://openalex.org/W2164147372","https://openalex.org/W642007152","https://openalex.org/W2550171623","https://openalex.org/W4253660971","https://openalex.org/W2401827384","https://openalex.org/W2355290951","https://openalex.org/W4304187160","https://openalex.org/W3126788496","https://openalex.org/W1909292483","https://openalex.org/W596245619"],"abstract_inverted_index":{"Voice":[0],"Conversion":[1],"(VC)":[2],"is":[3],"becoming":[4],"increasingly":[5],"popular":[6],"in":[7,39,138],"speech":[8,79],"synthesis":[9],"applications.":[10],"Most":[11,56],"methods":[12],"focus":[13],"on":[14,45,61],"many-to-many":[15],"VC":[16,27],"which":[17,64],"can":[18,35,65,120],"not":[19],"be":[20,36],"used":[21],"for":[22,95],"unseen":[23,38],"speakers.":[24],"One-shot":[25],"(any-to-any)":[26],"allows":[28],"the":[29,32,40,70,74,77],"source":[30],"and":[31,53,73,76,115,124],"target":[33],"speakers":[34],"both":[37],"inference":[41],"phase.":[42],"This":[43],"relies":[44],"an":[46],"additional":[47],"model":[48,72],"to":[49,67,111],"disengage":[50],"linguistic":[51],"information":[52],"speaker":[54,117],"information.":[55],"previous":[57],"works":[58],"were":[59],"based":[60],"two-stage":[62],"VC,":[63],"lead":[66],"mismatches":[68],"between":[69],"acoustic":[71],"vocoder,":[75],"generated":[78],"has":[80],"poor":[81],"quality":[82,123],"or":[83],"similarity.":[84],"In":[85],"this":[86],"work,":[87],"we":[88,103],"proposed":[89,132],"a":[90,105],"novel":[91],"method":[92,133],"trained":[93],"end-to-end":[94],"one-shot":[96,101],"voice":[97],"conversion.":[98,126],"Unlike":[99],"other":[100],"methods,":[102],"use":[104],"combination":[106],"of":[107],"multiple":[108],"ASV":[109],"models":[110],"obtain":[112],"more":[113],"accurate":[114],"robust":[116],"embedding":[118],"that":[119,130],"achieve":[121],"high":[122],"similarity":[125],"Experiment":[127],"results":[128],"demonstrate":[129],"our":[131],"outperforms":[134],"all":[135],"considered":[136],"baselines":[137],"different":[139],"gender":[140],"setups.":[141]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
