{"id":"https://openalex.org/W7148307382","doi":"https://doi.org/10.1109/asru65441.2025.11434672","title":"Maestro-EVC: Controllable Emotional Voice Conversion Guided by References and Explicit Prosody","display_name":"Maestro-EVC: Controllable Emotional Voice Conversion Guided by References and Explicit Prosody","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148307382","doi":"https://doi.org/10.1109/asru65441.2025.11434672"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434672","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434672","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002289527","display_name":"Jinsung Yoon","orcid":"https://orcid.org/0000-0002-5481-5171"},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Jinsung Yoon","raw_affiliation_strings":["Graduate School of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Graduate School of Artificial Intelligence","institution_ids":["https://openalex.org/I4210164862"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132814231","display_name":"Wooyeol Jeong","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Wooyeol Jeong","raw_affiliation_strings":["Graduate School of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Graduate School of Artificial Intelligence","institution_ids":["https://openalex.org/I4210164862"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008235525","display_name":"Jio Gim","orcid":"https://orcid.org/0000-0002-9866-4176"},"institutions":[{"id":"https://openalex.org/I123900574","display_name":"Pohang University of Science and Technology","ror":"https://ror.org/04xysgw12","country_code":"KR","type":"education","lineage":["https://openalex.org/I123900574"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jio Gim","raw_affiliation_strings":["Pohang University of Science and Technology (POSTECH),Dept. of Computer Science and Engineering,Pohang,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Pohang University of Science and Technology (POSTECH),Dept. of Computer Science and Engineering,Pohang,Republic of Korea","institution_ids":["https://openalex.org/I123900574"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132825850","display_name":"Young-Joo Suh","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Young-Joo Suh","raw_affiliation_strings":["Graduate School of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Graduate School of Artificial Intelligence","institution_ids":["https://openalex.org/I4210164862"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5002289527"],"corresponding_institution_ids":["https://openalex.org/I4210164862"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87339906,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5401999950408936,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5401999950408936,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.2815999984741211,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.06300000101327896,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8375999927520752},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5365999937057495},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5067999958992004},{"id":"https://openalex.org/keywords/emotional-prosody","display_name":"Emotional prosody","score":0.4830000102519989},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.46790000796318054},{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics (music)","score":0.4471000134944916}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8375999927520752},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.557699978351593},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5390999913215637},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5365999937057495},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5067999958992004},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4860000014305115},{"id":"https://openalex.org/C2778262033","wikidata":"https://www.wikidata.org/wiki/Q5373795","display_name":"Emotional prosody","level":3,"score":0.4830000102519989},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.46790000796318054},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.4471000134944916},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3465999960899353},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.34540000557899475},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3109000027179718},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.2930999994277954},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C143110190","wikidata":"https://www.wikidata.org/wiki/Q5373787","display_name":"Emotional expression","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434672","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434672","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.6209906935691833,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W388732865","https://openalex.org/W1494198834","https://openalex.org/W1574447377","https://openalex.org/W2030931454","https://openalex.org/W2109606373","https://openalex.org/W2146334809","https://openalex.org/W2471520273","https://openalex.org/W2726515241","https://openalex.org/W2955931541","https://openalex.org/W3024869864","https://openalex.org/W3096939667","https://openalex.org/W3128401974","https://openalex.org/W3162791003","https://openalex.org/W3163573274","https://openalex.org/W3197993066","https://openalex.org/W3209059054","https://openalex.org/W4205742757","https://openalex.org/W4221147462","https://openalex.org/W4240592325","https://openalex.org/W4285414384","https://openalex.org/W4286747238","https://openalex.org/W4312637065","https://openalex.org/W4372260157","https://openalex.org/W4385245566","https://openalex.org/W4385574033","https://openalex.org/W4385822877","https://openalex.org/W4391021638","https://openalex.org/W4392902857","https://openalex.org/W4392904630","https://openalex.org/W4392910613","https://openalex.org/W4402669711","https://openalex.org/W4408345780"],"related_works":[],"abstract_inverted_index":{"Emotional":[0],"voice":[1],"conversion":[2],"(EVC)":[3],"aims":[4],"to":[5,23,41,50,99],"modify":[6],"the":[7,21,48,104,108],"emotional":[8,29,53],"style":[9,30],"of":[10,70,107],"speech":[11,126],"while":[12],"preserving":[13],"its":[14],"linguistic":[15],"content.":[16],"In":[17],"practical":[18],"EVC,":[19],"controllability,":[20],"ability":[22,49],"independently":[24],"control":[25,69],"speaker":[26,72],"identity":[27],"and":[28,46,74,91,102,123],"using":[31],"distinct":[32],"references,":[33],"is":[34],"crucial.":[35],"However,":[36],"existing":[37],"methods":[38],"often":[39],"struggle":[40],"fully":[42],"disentangle":[43],"these":[44],"attributes":[45],"lack":[47],"model":[51],"fine-grained":[52],"expressions":[54],"such":[55],"as":[56],"temporal":[57,88,105],"dynamics.":[58],"We":[59,84],"propose":[60],"Maestro-EVC,":[61],"a":[62,87],"controllable":[63],"EVC":[64],"framework":[65],"that":[66,118],"enables":[67],"independent":[68],"content,":[71],"identity,":[73],"emotion":[75,89],"by":[76],"effectively":[77],"disentangling":[78],"each":[79],"attribute":[80],"from":[81],"separate":[82],"references.":[83],"further":[85],"introduce":[86],"representation":[90],"an":[92],"explicit":[93],"prosody":[94,97],"modeling":[95],"with":[96],"augmentation":[98],"robustly":[100],"capture":[101],"transfer":[103],"dynamics":[106],"target":[109],"emotion,":[110],"even":[111],"under":[112],"prosody-mismatched":[113],"conditions.":[114],"Experimental":[115],"results":[116],"confirm":[117],"Maestro-EVC":[119],"achieves":[120],"highquality,":[121],"controllable,":[122],"emotionally":[124],"expressive":[125],"synthesis.":[127]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-04-03T00:00:00"}
