{"id":"https://openalex.org/W3005420900","doi":"https://doi.org/10.1109/ijcnn48605.2020.9207653","title":"Vocoder-free End-to-End Voice Conversion with Transformer Network","display_name":"Vocoder-free End-to-End Voice Conversion with Transformer Network","publication_year":2020,"publication_date":"2020-07-01","ids":{"openalex":"https://openalex.org/W3005420900","doi":"https://doi.org/10.1109/ijcnn48605.2020.9207653","mag":"3005420900"},"language":"en","primary_location":{"id":"doi:10.1109/ijcnn48605.2020.9207653","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn48605.2020.9207653","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2002.03808","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"June-Woo Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I31419693","display_name":"Kyungpook National University","ror":"https://ror.org/040c17130","country_code":"KR","type":"education","lineage":["https://openalex.org/I31419693"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"June-Woo Kim","raw_affiliation_strings":["Department of Sensor and Display Engineering, Kyungpook National University, Daegu, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Sensor and Display Engineering, Kyungpook National University, Daegu, Republic of Korea","institution_ids":["https://openalex.org/I31419693"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Ho-Young Jung","orcid":null},"institutions":[{"id":"https://openalex.org/I31419693","display_name":"Kyungpook National University","ror":"https://ror.org/040c17130","country_code":"KR","type":"education","lineage":["https://openalex.org/I31419693"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Ho-Young Jung","raw_affiliation_strings":["Department of Sensor and Display Engineering, Kyungpook National University, Daegu, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Sensor and Display Engineering, Kyungpook National University, Daegu, Republic of Korea","institution_ids":["https://openalex.org/I31419693"]}]},{"author_position":"last","author":{"id":null,"display_name":"Minho Lee","orcid":null},"institutions":[{"id":"https://openalex.org/I31419693","display_name":"Kyungpook National University","ror":"https://ror.org/040c17130","country_code":"KR","type":"education","lineage":["https://openalex.org/I31419693"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Minho Lee","raw_affiliation_strings":["Department of Sensor and Display Engineering, Kyungpook National University, Daegu, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Sensor and Display Engineering, Kyungpook National University, Daegu, Republic of Korea","institution_ids":["https://openalex.org/I31419693"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4062,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.68193031,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7443000078201294},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5669999718666077},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.4641000032424927},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.4189000129699707},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4162999987602234},{"id":"https://openalex.org/keywords/filter-bank","display_name":"Filter bank","score":0.3702999949455261},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.36480000615119934}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7635999917984009},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7443000078201294},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6180999875068665},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5669999718666077},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.4641000032424927},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.4189000129699707},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4162999987602234},{"id":"https://openalex.org/C100515483","wikidata":"https://www.wikidata.org/wiki/Q3268235","display_name":"Filter bank","level":3,"score":0.3702999949455261},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.36480000615119934},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36329999566078186},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.31790000200271606},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C198386975","wikidata":"https://www.wikidata.org/wiki/Q117785","display_name":"Finite impulse response","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.28949999809265137},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/ijcnn48605.2020.9207653","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn48605.2020.9207653","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2002.03808","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2002.03808","pdf_url":"https://arxiv.org/pdf/2002.03808","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2002.03808","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2002.03808","pdf_url":"https://arxiv.org/pdf/2002.03808","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1832693441","https://openalex.org/W2064675550","https://openalex.org/W2120847449","https://openalex.org/W2131774270","https://openalex.org/W2194775991","https://openalex.org/W2903739847","https://openalex.org/W2913340405","https://openalex.org/W2963300588","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2972495969","https://openalex.org/W2972970915","https://openalex.org/W2981851019","https://openalex.org/W6600593648","https://openalex.org/W6631190155","https://openalex.org/W6637242042","https://openalex.org/W6640212811","https://openalex.org/W6674330103","https://openalex.org/W6678885109","https://openalex.org/W6679436768","https://openalex.org/W6738884980","https://openalex.org/W6739901393","https://openalex.org/W6748409065","https://openalex.org/W6755207826","https://openalex.org/W6771070734","https://openalex.org/W6843673214"],"related_works":[],"abstract_inverted_index":{"Mel-frequency":[0],"filter":[1],"bank":[2],"(MFB)":[3],"based":[4],"approaches":[5],"have":[6,111],"the":[7,16,30,40,49,52,68,97,119,126,132,178,182,205],"advantage":[8],"of":[9,24,75,121,128,131,161],"higher":[10],"learning":[11,122],"speeds":[12],"compared":[13],"to":[14,20,57,65,71,95,146,157],"using":[15,91,150,204],"raw":[17,69,151],"spectrum":[18,70],"due":[19],"a":[21,85,92,141,154],"smaller":[22],"number":[23],"features.":[25],"However,":[26],"speech":[27,174],"generators":[28],"with":[29,77,163,181,210],"MFB":[31,50,172],"approach":[32,145],"require":[33],"an":[34,170],"additional":[35,101],"computationally":[36],"expensive":[37],"vocoder":[38,53],"for":[39,173,193],"training":[41],"process.":[42],"The":[43],"pre-":[44,102],"and":[45,51,103,143,185,208],"post-processing":[46],"needed":[47],"by":[48,176],"is":[54,63,140,190],"not":[55,110],"essential":[56],"convert":[58,147],"human":[59],"voices,":[60],"because":[61],"it":[62],"possible":[64],"use":[66],"only":[67],"generate":[72,158],"different":[73,159],"style":[74,160],"voices":[76,149,162],"clear":[78,164],"pronunciation.":[79,165],"In":[80],"this":[81,136],"paper,":[82],"we":[83,167],"introduce":[84],"vocoder-free":[86],"end-to-end":[87],"voice":[88,199],"conversion":[89,188,200],"method":[90],"transformer":[93],"network":[94],"alleviate":[96],"computational":[98],"burden":[99],"from":[100],"post-processing.":[104],"Our":[105],"transformer-based":[106],"architecture,":[107],"which":[108],"does":[109],"any":[112],"CNN":[113],"or":[114],"RNN":[115],"layers,":[116],"has":[117],"shown":[118],"benefit":[120],"fast":[123,142],"while":[124],"solving":[125],"limitation":[127],"sequential":[129],"computation":[130],"conventional":[133],"RNN.":[134],"For":[135],"reason,":[137],"our":[138,187,198],"model":[139,189],"effective":[144],"realistic":[148],"spectra":[152],"in":[153],"parallel":[155],"manner":[156],"Furthermore,":[166],"can":[168],"get":[169],"adapted":[171],"recognition":[175],"multiplying":[177],"converted":[179],"magnitude":[180],"phase":[183],"information,":[184],"therefore":[186],"also":[191],"suitable":[192],"speaker":[194],"adaptation.":[195],"We":[196],"perform":[197],"experiments":[201],"on":[202],"TIDIGITS-dataset":[203],"naturalness,":[206],"similarity,":[207],"clarity":[209],"Mean":[211],"Opinion":[212],"Score":[213],"as":[214],"metrics":[215],"<sup":[216],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[217],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[218],".":[219]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2020-02-14T00:00:00"}
