{"id":"https://openalex.org/W4387247604","doi":"https://doi.org/10.1109/taslp.2023.3320864","title":"Disentangling Prosody Representations With Unsupervised Speech Reconstruction","display_name":"Disentangling Prosody Representations With Unsupervised Speech Reconstruction","publication_year":2023,"publication_date":"2023-10-02","ids":{"openalex":"https://openalex.org/W4387247604","doi":"https://doi.org/10.1109/taslp.2023.3320864"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3320864","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3320864","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10269014.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10269014.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023714181","display_name":"Leyuan Qu","orcid":"https://orcid.org/0000-0001-6694-5355"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Leyuan Qu","raw_affiliation_strings":["Institute of Artificial Intelligence, Zhejiang Lab, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence, Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021697903","display_name":"Taihao Li","orcid":"https://orcid.org/0000-0003-3279-7125"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Taihao Li","raw_affiliation_strings":["Institute of Artificial Intelligence, Zhejiang Lab, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence, Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102025003","display_name":"Cornelius Weber","orcid":"https://orcid.org/0000-0001-5163-938X"},"institutions":[{"id":"https://openalex.org/I159176309","display_name":"Universit\u00e4t Hamburg","ror":"https://ror.org/00g30e956","country_code":"DE","type":"education","lineage":["https://openalex.org/I159176309"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Cornelius Weber","raw_affiliation_strings":["Department of Informatics, University of Hamburg, Hamburg, Germany"],"affiliations":[{"raw_affiliation_string":"Department of Informatics, University of Hamburg, Hamburg, Germany","institution_ids":["https://openalex.org/I159176309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092489723","display_name":"Theresa Pekarek Rosin","orcid":"https://orcid.org/0000-0002-2847-5097"},"institutions":[{"id":"https://openalex.org/I159176309","display_name":"Universit\u00e4t Hamburg","ror":"https://ror.org/00g30e956","country_code":"DE","type":"education","lineage":["https://openalex.org/I159176309"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Theresa Pekarek-Rosin","raw_affiliation_strings":["Department of Informatics, University of Hamburg, Hamburg, Germany"],"affiliations":[{"raw_affiliation_string":"Department of Informatics, University of Hamburg, Hamburg, Germany","institution_ids":["https://openalex.org/I159176309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071943346","display_name":"Fuji Ren","orcid":"https://orcid.org/0000-0003-4860-9184"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fuji Ren","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033486668","display_name":"Stefan Wermter","orcid":"https://orcid.org/0000-0003-1343-4775"},"institutions":[{"id":"https://openalex.org/I159176309","display_name":"Universit\u00e4t Hamburg","ror":"https://ror.org/00g30e956","country_code":"DE","type":"education","lineage":["https://openalex.org/I159176309"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Stefan Wermter","raw_affiliation_strings":["Department of Informatics, University of Hamburg, Hamburg, Germany"],"affiliations":[{"raw_affiliation_string":"Department of Informatics, University of Hamburg, Hamburg, Germany","institution_ids":["https://openalex.org/I159176309"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5023714181"],"corresponding_institution_ids":["https://openalex.org/I4210123185"],"apc_list":null,"apc_paid":null,"fwci":1.8942,"has_fulltext":true,"cited_by_count":11,"citation_normalized_percentile":{"value":0.88693875,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"39","last_page":"54"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8600900769233704},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7296539545059204},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6775761246681213},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.6072003841400146},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5067625641822815},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.4810355305671692},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4721943140029907},{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.4283980131149292},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.11477136611938477}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8600900769233704},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7296539545059204},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6775761246681213},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.6072003841400146},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5067625641822815},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.4810355305671692},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4721943140029907},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.4283980131149292},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.11477136611938477},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3320864","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3320864","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10269014.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/taslp.2023.3320864","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3320864","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10269014.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G762232396","display_name":null,"funder_award_id":"Project","funder_id":"https://openalex.org/F4320320879","funder_display_name":"Deutsche Forschungsgemeinschaft"}],"funders":[{"id":"https://openalex.org/F4320320879","display_name":"Deutsche Forschungsgemeinschaft","ror":"https://ror.org/018mejw64"},{"id":"https://openalex.org/F4320329860","display_name":"National Science and Technology Major Project","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387247604.pdf","grobid_xml":"https://content.openalex.org/works/W4387247604.grobid-xml"},"referenced_works_count":97,"referenced_works":["https://openalex.org/W648786980","https://openalex.org/W854541894","https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W1686946872","https://openalex.org/W2009375902","https://openalex.org/W2050681655","https://openalex.org/W2069924379","https://openalex.org/W2130821326","https://openalex.org/W2146334809","https://openalex.org/W2187089797","https://openalex.org/W2342475039","https://openalex.org/W2402146185","https://openalex.org/W2511640485","https://openalex.org/W2583542555","https://openalex.org/W2742542661","https://openalex.org/W2748654097","https://openalex.org/W2748702193","https://openalex.org/W2752782242","https://openalex.org/W2774085128","https://openalex.org/W2785978752","https://openalex.org/W2794490148","https://openalex.org/W2795109282","https://openalex.org/W2803193013","https://openalex.org/W2808631503","https://openalex.org/W2889374687","https://openalex.org/W2891205112","https://openalex.org/W2896457183","https://openalex.org/W2904459034","https://openalex.org/W2928165649","https://openalex.org/W2936774411","https://openalex.org/W2945478979","https://openalex.org/W2963199341","https://openalex.org/W2964243274","https://openalex.org/W2972498864","https://openalex.org/W2973049979","https://openalex.org/W2973181312","https://openalex.org/W2995181338","https://openalex.org/W3015241559","https://openalex.org/W3015884429","https://openalex.org/W3016181583","https://openalex.org/W3024869864","https://openalex.org/W3025680351","https://openalex.org/W3034794073","https://openalex.org/W3036601975","https://openalex.org/W3096723250","https://openalex.org/W3112034174","https://openalex.org/W3112594642","https://openalex.org/W3130293557","https://openalex.org/W3135547455","https://openalex.org/W3160039712","https://openalex.org/W3175161143","https://openalex.org/W3197580070","https://openalex.org/W3197993066","https://openalex.org/W3204087964","https://openalex.org/W3204457821","https://openalex.org/W3205428167","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3211224152","https://openalex.org/W4200635083","https://openalex.org/W4205742757","https://openalex.org/W4221145109","https://openalex.org/W4221147462","https://openalex.org/W4221162872","https://openalex.org/W4224918091","https://openalex.org/W4225939199","https://openalex.org/W4226487411","https://openalex.org/W4285251897","https://openalex.org/W4287073476","https://openalex.org/W4287887366","https://openalex.org/W4295731579","https://openalex.org/W4301371414","https://openalex.org/W4311000453","https://openalex.org/W4312120641","https://openalex.org/W4313887688","https://openalex.org/W4319988532","https://openalex.org/W4385574033","https://openalex.org/W6621543089","https://openalex.org/W6623517193","https://openalex.org/W6631190155","https://openalex.org/W6637108112","https://openalex.org/W6746468907","https://openalex.org/W6750489868","https://openalex.org/W6754420807","https://openalex.org/W6755207826","https://openalex.org/W6762533536","https://openalex.org/W6776390925","https://openalex.org/W6780218876","https://openalex.org/W6786669483","https://openalex.org/W6795949861","https://openalex.org/W6803378298","https://openalex.org/W6803547063","https://openalex.org/W6810007534","https://openalex.org/W6810585344","https://openalex.org/W6847363464","https://openalex.org/W6849896277"],"related_works":["https://openalex.org/W2159052453","https://openalex.org/W3013693939","https://openalex.org/W2566616303","https://openalex.org/W3131327266","https://openalex.org/W4297051394","https://openalex.org/W2752972570","https://openalex.org/W2406877384","https://openalex.org/W2734887215","https://openalex.org/W2803255133","https://openalex.org/W2595839522"],"abstract_inverted_index":{"Human":[0],"speech":[1,96,114,124,162,243],"can":[2,208,260],"be":[3,209,261],"characterized":[4],"by":[5,231],"different":[6,59],"components,":[7],"including":[8],"semantic":[9,25,130],"content,":[10,131],"speaker":[11,28,36,135,140],"identity":[12,29,141],"and":[13,27,35,64,66,79,106,143,177,186,189,235,246],"prosodic":[14,51,205],"information.":[15],"Significant":[16],"progress":[17],"has":[18],"been":[19],"made":[20],"in":[21,30,111],"disentangling":[22],"representations":[23,158],"for":[24,71,129,237],"content":[26],"Automatic":[31],"Speech":[32,173],"Recognition":[33,175],"(ASR)":[34],"verification":[37,136],"tasks":[38],"respectively.":[39],"However,":[40],"it":[41],"is":[42,87],"still":[43],"an":[44],"open":[45],"challenging":[46],"research":[47],"question":[48],"to":[49,75,88,138,149,171,212],"extract":[50],"information":[52],"because":[53,67],"of":[54,58,68,84,92,240],"the":[55,69,90,156,166,196,222,227,238,248],"intrinsic":[56],"association":[57],"attributes,":[60],"such":[61],"as":[62],"timbre":[63],"rhythm,":[65],"need":[70],"supervised":[72],"training":[73],"schemes":[74],"achieve":[76],"robust":[77],"large-scale":[78],"speaker-independent":[80],"ASR.":[81],"The":[82],"aim":[83],"this":[85],"paper":[86],"address":[89],"disentanglement":[91],"emotional":[93,161,214],"prosody":[94,147,151,228],"from":[95],"based":[97],"on":[98,159,168,195,221,263],"unsupervised":[99],"reconstruction.":[100],"Specifically,":[101],"we":[102],"identify,":[103],"design,":[104],"implement":[105],"integrate":[107],"three":[108],"crucial":[109],"components":[110],"our":[112,218,264],"proposed":[113],"reconstruction":[115],"model":[116,137,167],"Prosody2Vec:":[117],"(1)":[118],"a":[119,133,145],"unit":[120],"encoder":[121,148],"that":[122,200,207,226],"transforms":[123],"signals":[125],"into":[126],"discrete":[127],"units":[128],"(2)":[132],"pretrained":[134],"generate":[139],"embeddings,":[142],"(3)":[144],"trainable":[146],"learn":[150],"representations.":[152,256],"We":[153],"first":[154],"pretrain":[155],"Prosody2Vec":[157,201,232,253],"unlabelled":[160],"corpora,":[163],"then":[164],"fine-tune":[165],"specific":[169],"datasets":[170],"perform":[172],"Emotion":[174],"(SER)":[176],"Emotional":[178],"Voice":[179],"Conversion":[180],"(EVC)":[181],"tasks.":[182],"Both":[183],"objective":[184],"(weighted":[185],"unweighted":[187],"accuracies)":[188],"subjective":[190],"(mean":[191],"opinion":[192],"score)":[193],"evaluations":[194],"EVC":[197],"task":[198],"suggest":[199],"effectively":[202],"captures":[203],"general":[204],"features":[206,229],"smoothly":[210],"transferred":[211],"other":[213],"speech.":[215],"In":[216],"addition,":[217],"SER":[219],"experiments":[220],"IEMOCAP":[223],"dataset":[224],"reveal":[225],"learned":[230],"are":[233],"complementary":[234],"beneficial":[236],"performance":[239],"widely":[241],"used":[242],"pretraining":[244],"models":[245],"surpass":[247],"state-of-the-art":[249],"methods":[250],"when":[251],"combining":[252],"with":[254],"HuBERT":[255],"Some":[257],"audio":[258],"samples":[259],"found":[262],"demo":[265],"website":[266]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
