{"id":"https://openalex.org/W4407900542","doi":"https://doi.org/10.1109/tnnls.2025.3534822","title":"Disentanglement of Prosody Representations via Diffusion Models and Scheduled Gradient Reversal","display_name":"Disentanglement of Prosody Representations via Diffusion Models and Scheduled Gradient Reversal","publication_year":2025,"publication_date":"2025-02-24","ids":{"openalex":"https://openalex.org/W4407900542","doi":"https://doi.org/10.1109/tnnls.2025.3534822","pmid":"https://pubmed.ncbi.nlm.nih.gov/40031860"},"language":"en","primary_location":{"id":"doi:10.1109/tnnls.2025.3534822","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnnls.2025.3534822","pdf_url":null,"source":{"id":"https://openalex.org/S4210175523","display_name":"IEEE Transactions on Neural Networks and Learning Systems","issn_l":"2162-237X","issn":["2162-237X","2162-2388"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Neural Networks and Learning Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023714181","display_name":"Leyuan Qu","orcid":"https://orcid.org/0000-0001-6694-5355"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Leyuan Qu","raw_affiliation_strings":["Hangzhou Institute for Advanced Study, UCAS, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6694-5355","affiliations":[{"raw_affiliation_string":"Hangzhou Institute for Advanced Study, UCAS, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102025003","display_name":"Cornelius Weber","orcid":"https://orcid.org/0000-0001-5163-938X"},"institutions":[{"id":"https://openalex.org/I159176309","display_name":"Universit\u00e4t Hamburg","ror":"https://ror.org/00g30e956","country_code":"DE","type":"education","lineage":["https://openalex.org/I159176309"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Cornelius Weber","raw_affiliation_strings":["Department of Informatics, University of Hamburg, Hamburg, Germany"],"raw_orcid":"https://orcid.org/0000-0001-5163-938X","affiliations":[{"raw_affiliation_string":"Department of Informatics, University of Hamburg, Hamburg, Germany","institution_ids":["https://openalex.org/I159176309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100392052","display_name":"Wei Wang","orcid":"https://orcid.org/0000-0002-6810-9518"},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Wang","raw_affiliation_strings":["International Cultural Exchange College, Xinjiang University, &#x00DC;r&#x00FC;mqi, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"International Cultural Exchange College, Xinjiang University, &#x00DC;r&#x00FC;mqi, China","institution_ids":["https://openalex.org/I96908189"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025340202","display_name":"Jia Jin","orcid":null},"institutions":[{"id":"https://openalex.org/I11406153","display_name":"Shanghai International Studies University","ror":"https://ror.org/01bn89z48","country_code":"CN","type":"education","lineage":["https://openalex.org/I11406153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jia Jin","raw_affiliation_strings":["School of Business and Management, Shanghai International Studies University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Business and Management, Shanghai International Studies University, Shanghai, China","institution_ids":["https://openalex.org/I11406153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033063238","display_name":"Yingming Gao","orcid":"https://orcid.org/0000-0001-5881-3723"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yingming Gao","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5881-3723","affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021697903","display_name":"Taihao Li","orcid":"https://orcid.org/0000-0003-3279-7125"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taihao Li","raw_affiliation_strings":["Hangzhou Institute for Advanced Study, UCAS, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hangzhou Institute for Advanced Study, UCAS, Hangzhou, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033486668","display_name":"Stefan Wermter","orcid":"https://orcid.org/0000-0003-1343-4775"},"institutions":[{"id":"https://openalex.org/I159176309","display_name":"Universit\u00e4t Hamburg","ror":"https://ror.org/00g30e956","country_code":"DE","type":"education","lineage":["https://openalex.org/I159176309"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Stefan Wermter","raw_affiliation_strings":["Department of Informatics, University of Hamburg, Hamburg, Germany"],"raw_orcid":"https://orcid.org/0000-0003-1343-4775","affiliations":[{"raw_affiliation_string":"Department of Informatics, University of Hamburg, Hamburg, Germany","institution_ids":["https://openalex.org/I159176309"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5023714181"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.1733,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.87281927,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"36","issue":"8","first_page":"15043","last_page":"15054"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9513000249862671,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9513000249862671,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9199000000953674,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.7505468726158142},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5793260335922241},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4405980706214905},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3785356283187866},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.23290887475013733},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.23218515515327454}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.7505468726158142},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5793260335922241},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4405980706214905},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3785356283187866},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.23290887475013733},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.23218515515327454},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D004644","descriptor_name":"Emotions","qualifier_ui":"Q000502","qualifier_name":"physiology","is_major_topic":false},{"descriptor_ui":"D004644","descriptor_name":"Emotions","qualifier_ui":"Q000502","qualifier_name":"physiology","is_major_topic":false},{"descriptor_ui":"D004644","descriptor_name":"Emotions","qualifier_ui":"Q000502","qualifier_name":"physiology","is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D012660","descriptor_name":"Semantics","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D012660","descriptor_name":"Semantics","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D012660","descriptor_name":"Semantics","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":"Q000502","qualifier_name":"physiology","is_major_topic":true},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":"Q000502","qualifier_name":"physiology","is_major_topic":true},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":"Q000502","qualifier_name":"physiology","is_major_topic":true},{"descriptor_ui":"D013067","descriptor_name":"Speech Perception","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013067","descriptor_name":"Speech Perception","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013067","descriptor_name":"Speech Perception","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016571","descriptor_name":"Neural Networks, Computer","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D016571","descriptor_name":"Neural Networks, Computer","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D016571","descriptor_name":"Neural Networks, Computer","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":2,"locations":[{"id":"doi:10.1109/tnnls.2025.3534822","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnnls.2025.3534822","pdf_url":null,"source":{"id":"https://openalex.org/S4210175523","display_name":"IEEE Transactions on Neural Networks and Learning Systems","issn_l":"2162-237X","issn":["2162-237X","2162-2388"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Neural Networks and Learning Systems","raw_type":"journal-article"},{"id":"pmid:40031860","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40031860","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on neural networks and learning systems","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7099999785423279,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[{"id":"https://openalex.org/G2029314019","display_name":null,"funder_award_id":"LQN25F020001","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5125208950","display_name":null,"funder_award_id":"72271166","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320315329","display_name":"Scientific Education and Research Foundation","ror":"https://ror.org/029zvkg32"},{"id":"https://openalex.org/F4320320879","display_name":"Deutsche Forschungsgemeinschaft","ror":"https://ror.org/018mejw64"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":72,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2017561954","https://openalex.org/W2085662862","https://openalex.org/W2137639365","https://openalex.org/W2146334809","https://openalex.org/W2150791533","https://openalex.org/W2159570759","https://openalex.org/W2402146185","https://openalex.org/W2629461003","https://openalex.org/W2752782242","https://openalex.org/W2904459034","https://openalex.org/W2928165649","https://openalex.org/W2936774411","https://openalex.org/W2941086874","https://openalex.org/W2949175740","https://openalex.org/W2963447013","https://openalex.org/W2969985801","https://openalex.org/W2972498864","https://openalex.org/W2973049979","https://openalex.org/W2973133192","https://openalex.org/W2995181338","https://openalex.org/W3006705189","https://openalex.org/W3006926732","https://openalex.org/W3015308237","https://openalex.org/W3015554124","https://openalex.org/W3024869864","https://openalex.org/W3096723250","https://openalex.org/W3101080567","https://openalex.org/W3130293557","https://openalex.org/W3135547455","https://openalex.org/W3162475537","https://openalex.org/W3200314169","https://openalex.org/W3204457821","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4210777104","https://openalex.org/W4221162872","https://openalex.org/W4224926192","https://openalex.org/W4287887366","https://openalex.org/W4296068587","https://openalex.org/W4313887688","https://openalex.org/W4366493008","https://openalex.org/W4367359628","https://openalex.org/W4375869114","https://openalex.org/W4385245566","https://openalex.org/W4387247604","https://openalex.org/W4390872387","https://openalex.org/W4390873135","https://openalex.org/W4392172995","https://openalex.org/W4392903514","https://openalex.org/W4392904442","https://openalex.org/W4392904630","https://openalex.org/W4392909765","https://openalex.org/W4402669711","https://openalex.org/W4406462025","https://openalex.org/W6631190155","https://openalex.org/W6639480849","https://openalex.org/W6679045638","https://openalex.org/W6683017116","https://openalex.org/W6691669583","https://openalex.org/W6737575990","https://openalex.org/W6750489868","https://openalex.org/W6755207826","https://openalex.org/W6762533536","https://openalex.org/W6763832098","https://openalex.org/W6776390925","https://openalex.org/W6778823374","https://openalex.org/W6783182287","https://openalex.org/W6803547063","https://openalex.org/W6810007534","https://openalex.org/W6810926057","https://openalex.org/W6838923520"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2355553914","https://openalex.org/W149862513","https://openalex.org/W2347684782","https://openalex.org/W187117048","https://openalex.org/W4320472397","https://openalex.org/W2401269021","https://openalex.org/W2145654520"],"abstract_inverted_index":{"Prosody":[0],"plays":[1],"a":[2,70,93,108,132,197,218],"fundamental":[3],"role":[4],"in":[5,119,170,196,200,225],"human":[6],"speech":[7,23,33,39],"and":[8,12,15,38,77,103,123,127,140,154,159,180,202,228],"communication,":[9],"facilitating":[10],"intelligibility":[11],"conveying":[13],"emotional":[14],"cognitive":[16],"states.":[17],"Extracting":[18],"accurate":[19],"prosodic":[20,53],"information":[21,102,173],"from":[22],"is":[24,45,138],"vital":[25],"for":[26,116,238],"building":[27],"assistive":[28],"technology,":[29],"such":[30,55,242],"as":[31,56,243],"controllable":[32],"synthesis,":[34],"speaking":[35],"style":[36,246],"transfer,":[37],"emotion":[40],"recognition":[41],"(SER).":[42],"However,":[43],"it":[44,98],"challenging":[46],"to":[47,75,82,125,217],"disentangle":[48,76,83],"speaker-independent":[49,178,192],"prosody":[50,79,84,109,113,130,144,193],"representations":[51,114],"since":[52],"attributes,":[54,62],"intonation,":[57],"excessively":[58],"entangle":[59],"with":[60,209],"speaker-specific":[61,172],"e.g.,":[63],"pitch.":[64],"In":[65,204],"this":[66],"article,":[67],"we":[68,86],"propose":[69],"novel":[71],"model,":[72],"called":[73],"Diffsody,":[74],"refine":[78,126],"representations:":[80],"1)":[81],"representations,":[85,131,194],"leverage":[87],"the":[88,143,164,167,210,232],"expressive":[89],"generative":[90],"ability":[91],"of":[92,146,166],"diffusion":[94],"model":[95,213,234],"by":[96],"conditioning":[97],"on":[99,177,254],"quantified":[100],"semantic":[101,211],"pretrained":[104],"speaker":[105,160],"embeddings.":[106],"Additionally,":[107],"encoder":[110,145],"automatically":[111],"learns":[112],"used":[115],"spectrogram":[117],"reconstruction":[118],"an":[120],"unsupervised":[121],"fashion;":[122],"2)":[124],"learn":[128],"speaker-invariant":[129],"scheduled":[133],"gradient":[134],"reversal":[135],"layer":[136],"(sGRL)":[137],"proposed":[139],"integrated":[141],"into":[142],"Diffsody.":[147],"We":[148],"extensively":[149],"evaluate":[150],"Diffsody":[151,188,206,233],"through":[152],"qualitative":[153],"quantitative":[155],"means.":[156],"t-SNE":[157],"visualization":[158],"verification":[161],"experiments":[162],"demonstrate":[163,186],"efficacy":[165],"sGRL":[168],"method":[169],"preventing":[171],"leakage.":[174],"Experimental":[175],"results":[176],"SER":[179,201,227],"automatic":[181],"depression":[182],"detection":[183],"(ADD)":[184],"tasks":[185],"that":[187],"can":[189,251],"efficiently":[190],"factorize":[191],"resulting":[195],"significant":[198],"boost":[199],"ADD.":[203],"addition,":[205],"synergistically":[207],"integrates":[208],"representation":[212],"WavLM,":[214],"which":[215],"leads":[216],"discernibly":[219],"elevated":[220],"performance,":[221],"outperforming":[222],"contemporary":[223],"methods":[224],"both":[226],"ADD":[229],"tasks.":[230],"Furthermore,":[231],"exhibits":[235],"promising":[236],"potential":[237],"various":[239],"practical":[240],"applications,":[241],"voice":[244],"or":[245],"conversion.":[247],"Some":[248],"audio":[249],"samples":[250],"be":[252],"found":[253],"our":[255],"https://leyuanqu.github.io/Diffsody/demo":[256],"website.":[257]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
