{"id":"https://openalex.org/W4392979699","doi":"https://doi.org/10.1109/taslp.2024.3379901","title":"VoiceGrad: Non-Parallel Any-to-Many Voice Conversion With Annealed Langevin Dynamics","display_name":"VoiceGrad: Non-Parallel Any-to-Many Voice Conversion With Annealed Langevin Dynamics","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4392979699","doi":"https://doi.org/10.1109/taslp.2024.3379901"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3379901","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3379901","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001243214","display_name":"Hirokazu Kameoka","orcid":"https://orcid.org/0000-0003-3102-0162"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Hirokazu Kameoka","raw_affiliation_strings":["NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"raw_orcid":"https://orcid.org/0000-0003-3102-0162","affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020693766","display_name":"Takuhiro Kaneko","orcid":"https://orcid.org/0009-0000-8016-5144"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takuhiro Kaneko","raw_affiliation_strings":["NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"raw_orcid":"https://orcid.org/0009-0000-8016-5144","affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106710403","display_name":"Kou Tanaka","orcid":"https://orcid.org/0009-0003-7107-607X"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kou Tanaka","raw_affiliation_strings":["NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"raw_orcid":"https://orcid.org/0009-0003-7107-607X","affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079710814","display_name":"Nobukatsu Hojo","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Nobukatsu Hojo","raw_affiliation_strings":["NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018003761","display_name":"Shogo Seki","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shogo Seki","raw_affiliation_strings":["NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"raw_orcid":"https://orcid.org/0009-0007-3990-3740","affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5001243214"],"corresponding_institution_ids":["https://openalex.org/I2251713219"],"apc_list":null,"apc_paid":null,"fwci":3.2924,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.9266327,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"2213","last_page":"2226"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/langevin-dynamics","display_name":"Langevin dynamics","score":0.7221687436103821},{"id":"https://openalex.org/keywords/statistical-physics","display_name":"Statistical physics","score":0.5374655723571777},{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics (music)","score":0.5370724201202393},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.44318053126335144},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.3655230402946472},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.22914248704910278}],"concepts":[{"id":"https://openalex.org/C2780004032","wikidata":"https://www.wikidata.org/wiki/Q6485978","display_name":"Langevin dynamics","level":2,"score":0.7221687436103821},{"id":"https://openalex.org/C121864883","wikidata":"https://www.wikidata.org/wiki/Q677916","display_name":"Statistical physics","level":1,"score":0.5374655723571777},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.5370724201202393},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.44318053126335144},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.3655230402946472},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.22914248704910278}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3379901","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3379901","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":71,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1522137499","https://openalex.org/W1901129140","https://openalex.org/W2013035813","https://openalex.org/W2017742648","https://openalex.org/W2022125261","https://openalex.org/W2056852181","https://openalex.org/W2123003832","https://openalex.org/W2142300631","https://openalex.org/W2148846882","https://openalex.org/W2161727827","https://openalex.org/W2518172956","https://openalex.org/W2526425061","https://openalex.org/W2532494225","https://openalex.org/W2608377413","https://openalex.org/W2752796333","https://openalex.org/W2804998325","https://openalex.org/W2888922217","https://openalex.org/W2889061305","https://openalex.org/W2889329491","https://openalex.org/W2899877258","https://openalex.org/W2902070858","https://openalex.org/W2937579788","https://openalex.org/W2946555236","https://openalex.org/W2947196194","https://openalex.org/W2962788625","https://openalex.org/W2962793481","https://openalex.org/W2962896155","https://openalex.org/W2963444790","https://openalex.org/W2963539064","https://openalex.org/W2963767194","https://openalex.org/W2963808252","https://openalex.org/W2972544500","https://openalex.org/W2972667718","https://openalex.org/W2995181338","https://openalex.org/W2996414377","https://openalex.org/W3034420534","https://openalex.org/W3046998876","https://openalex.org/W3082130377","https://openalex.org/W3083423753","https://openalex.org/W3096567388","https://openalex.org/W3100696337","https://openalex.org/W3101689408","https://openalex.org/W3102628737","https://openalex.org/W3113687514","https://openalex.org/W3165478005","https://openalex.org/W4225956675","https://openalex.org/W4226320669","https://openalex.org/W4296068974","https://openalex.org/W4392979699","https://openalex.org/W6603838645","https://openalex.org/W6631190155","https://openalex.org/W6631943919","https://openalex.org/W6635084905","https://openalex.org/W6640963894","https://openalex.org/W6675944832","https://openalex.org/W6695676441","https://openalex.org/W6714644935","https://openalex.org/W6731370813","https://openalex.org/W6732249622","https://openalex.org/W6735913928","https://openalex.org/W6752910514","https://openalex.org/W6762533536","https://openalex.org/W6765775151","https://openalex.org/W6778946027","https://openalex.org/W6779823529","https://openalex.org/W6780218876","https://openalex.org/W6782760101","https://openalex.org/W6783867762","https://openalex.org/W6788990321","https://openalex.org/W6802527329"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2144266858","https://openalex.org/W4221165290","https://openalex.org/W3105693600","https://openalex.org/W3080947604","https://openalex.org/W325125432","https://openalex.org/W2030944229","https://openalex.org/W2973441963","https://openalex.org/W2965271035","https://openalex.org/W3103050273"],"abstract_inverted_index":{"In":[0],"this":[1,112],"paper,":[2],"we":[3],"propose":[4],"a":[5,20,45,48,53,118],"non-parallel":[6,134],"any-to-many":[7,116],"voice":[8],"conversion":[9],"(VC)":[10],"method":[11],"termed":[12],"<italic":[13],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[14],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">VoiceGrad</i>":[15],".":[16],"Inspired":[17],"by":[18,82],"WaveGrad,":[19],"recently":[21],"introduced":[22],"novel":[23],"waveform":[24],"generation":[25],"method,":[26],"VoiceGrad":[27,114],"is":[28],"based":[29],"upon":[30],"the":[31,58,61,65,99,104,109,123],"concepts":[32],"of":[33,60,64,69,103,111,125],"score":[34,46,74],"matching,":[35],"Langevin":[36,85],"dynamics,":[37],"and":[38,131],"diffusion":[39,89],"models.":[40],"The":[41,72],"idea":[42],"involves":[43],"training":[44],"approximator,":[47],"fully":[49],"convolutional":[50],"network":[51],"with":[52],"U-Net":[54],"structure,":[55],"to":[56,79,91,108],"predict":[57],"gradient":[59],"log":[62],"density":[63],"speech":[66,127],"feature":[67,96],"sequences":[68],"multiple":[70],"speakers.":[71],"trained":[73],"approximator":[75],"can":[76,128],"be":[77,129],"used":[78],"perform":[80],"VC":[81,119],"using":[83],"annealed":[84],"dynamics":[86],"or":[87],"reverse":[88],"process":[90],"iteratively":[92],"update":[93],"an":[94],"input":[95,126],"sequence":[97],"towards":[98],"nearest":[100],"stationary":[101],"point":[102],"target":[105],"distribution.":[106],"Thanks":[107],"nature":[110],"concept,":[113],"enables":[115],"VC,":[117],"scenario":[120],"in":[121],"which":[122,136],"speaker":[124],"arbitrary,":[130],"allows":[132],"for":[133],"training,":[135],"requires":[137],"no":[138],"parallel":[139],"utterances.":[140]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":2}],"updated_date":"2026-05-02T08:42:23.175194","created_date":"2025-10-10T00:00:00"}
