{"id":"https://openalex.org/W4415708398","doi":"https://doi.org/10.1109/icme59968.2025.11209157","title":"A Progressive Generation Framework with Speech Pre-trained Model for Expressive Voice Conversion","display_name":"A Progressive Generation Framework with Speech Pre-trained Model for Expressive Voice Conversion","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708398","doi":"https://doi.org/10.1109/icme59968.2025.11209157"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11209157","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209157","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049261843","display_name":"Tianrui Wang","orcid":"https://orcid.org/0009-0005-1517-9589"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tianrui Wang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071074013","display_name":"Meng Ge","orcid":"https://orcid.org/0000-0003-2017-4529"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Ge","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109531461","display_name":"Zhikang Niu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhikang Niu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Cheng Gong","orcid":null},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cheng Gong","raw_affiliation_strings":["China Telecom,Institute of Artificial Intelligence,China"],"affiliations":[{"raw_affiliation_string":"China Telecom,Institute of Artificial Intelligence,China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028353824","display_name":"Chunyu Qiang","orcid":"https://orcid.org/0009-0007-2290-3074"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chunyu Qiang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100427183","display_name":"Haoyu Wang","orcid":"https://orcid.org/0000-0002-4229-0347"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoyu Wang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zikang Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zikang Huang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438492","display_name":"Ziyang Ma","orcid":"https://orcid.org/0000-0002-0623-9114"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziyang Ma","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103015888","display_name":"Xiaobao Wang","orcid":"https://orcid.org/0000-0001-5086-4964"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaobao Wang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101293966","display_name":"Xie Chen","orcid":"https://orcid.org/0000-0001-7423-617X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xie Chen","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101745212","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-6970-4765"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210145761","display_name":"Shenzhen Institutes of Advanced Technology","ror":"https://ror.org/04gh4er46","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210145761"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Chinese Academy of Sciences,Shenzhen Institute of Advanced Technology,Guangdong,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Shenzhen Institute of Advanced Technology,Guangdong,China","institution_ids":["https://openalex.org/I4210145761","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5049261843"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16148605,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6572999954223633,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6572999954223633,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.15320000052452087,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.05810000002384186,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6464999914169312},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5164999961853027},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4408999979496002},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.4399000108242035},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.41830000281333923},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.39989998936653137},{"id":"https://openalex.org/keywords/voice-analysis","display_name":"Voice analysis","score":0.3937000036239624},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.38029998540878296}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7253999710083008},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6832000017166138},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6464999914169312},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5164999961853027},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4408999979496002},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.4399000108242035},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.41830000281333923},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.40540000796318054},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.39989998936653137},{"id":"https://openalex.org/C182964821","wikidata":"https://www.wikidata.org/wiki/Q7939498","display_name":"Voice analysis","level":2,"score":0.3937000036239624},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.38029998540878296},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.349700003862381},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.34599998593330383},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.3440000116825104},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3418999910354614},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.3402000069618225},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.3327000141143799},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3109999895095825},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C54953205","wikidata":"https://www.wikidata.org/wiki/Q4142201","display_name":"Speech analytics","level":4,"score":0.28519999980926514},{"id":"https://openalex.org/C541956065","wikidata":"https://www.wikidata.org/wiki/Q2250680","display_name":"Speech error","level":3,"score":0.26010000705718994},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11209157","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209157","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1965248225","https://openalex.org/W2545177271","https://openalex.org/W2902070858","https://openalex.org/W2962788625","https://openalex.org/W2963767194","https://openalex.org/W2972659941","https://openalex.org/W3015805741","https://openalex.org/W3044514286","https://openalex.org/W3096918678","https://openalex.org/W3197659778","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4200225196","https://openalex.org/W4226474318","https://openalex.org/W4285250921","https://openalex.org/W4286747238","https://openalex.org/W4323896824","https://openalex.org/W4372337821","https://openalex.org/W4382239830","https://openalex.org/W4386113956","https://openalex.org/W4391021367","https://openalex.org/W4392931276","https://openalex.org/W4392969497","https://openalex.org/W4400621997","https://openalex.org/W4401070302","https://openalex.org/W4402115910","https://openalex.org/W4402669711","https://openalex.org/W4409763559"],"related_works":[],"abstract_inverted_index":{"Expressive":[0],"voice":[1],"conversion":[2],"(EVC)":[3],"aims":[4],"to":[5,67,86],"modify":[6],"the":[7,32,82,135],"speaker":[8,74,95,118],"identity":[9],"and":[10,27,57,73,94,116,119],"emotional":[11,71,92],"style":[12,93],"of":[13,138],"speech":[14,38,51,64,83,89,139],"while":[15],"preserving":[16],"its":[17],"content.":[18],"Existing":[19],"approaches":[20],"often":[21],"focus":[22],"on":[23],"disentangling":[24],"speaker,":[25],"emotion,":[26],"content":[28],"information":[29],"but":[30],"overlook":[31],"progressive":[33,55],"generation":[34],"mechanisms":[35],"in":[36,114],"human":[37],"production.":[39],"To":[40],"address":[41],"this,":[42],"we":[43],"propose":[44],"a":[45,50,54,130],"three-stage":[46],"framework":[47,62,106,127],"that":[48,104],"includes":[49],"disentanglement":[52,109,136],"module,":[53],"generator,":[56],"an":[58],"acoustic":[59],"refiner.":[60],"This":[61,126],"enables":[63],"pre-trained":[65,101],"models":[66,102],"parse":[68],"linguistic":[69],"content,":[70],"style,":[72],"identity,":[75],"which":[76],"are":[77],"then":[78],"progressively":[79],"integrated":[80],"into":[81],"reconstruction":[84],"branch":[85],"generate":[87],"high-quality":[88],"with":[90,98],"replaceable":[91],"identity.":[96],"Experiments":[97],"six":[99],"different":[100,123],"show":[103],"our":[105],"activates":[107],"their":[108],"capabilities,":[110],"surpassing":[111],"baseline":[112],"performance":[113],"EVC,":[115],"supports":[117],"emotion":[120],"control":[121],"from":[122],"target":[124],"samples.":[125],"also":[128],"provides":[129],"valuable":[131],"reference":[132],"for":[133],"evaluating":[134],"capabilities":[137],"pre-training":[140],"models.":[141]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-30T00:00:00"}
