{"id":"https://openalex.org/W4395447377","doi":"https://doi.org/10.1109/taslp.2024.3393714","title":"USAT: A Universal Speaker-Adaptive Text-to-Speech Approach","display_name":"USAT: A Universal Speaker-Adaptive Text-to-Speech Approach","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4395447377","doi":"https://doi.org/10.1109/taslp.2024.3393714"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3393714","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3393714","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.18094","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100329719","display_name":"Wenbin Wang","orcid":"https://orcid.org/0000-0001-9710-0136"},"institutions":[{"id":"https://openalex.org/I31746571","display_name":"UNSW Sydney","ror":"https://ror.org/03r8z3t63","country_code":"AU","type":"education","lineage":["https://openalex.org/I31746571"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Wenbin Wang","raw_affiliation_strings":["School of Computer Science and Engineering, University of New South Wales, Kensington, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of New South Wales, Kensington, NSW, Australia","institution_ids":["https://openalex.org/I31746571"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041567418","display_name":"Yang Song","orcid":"https://orcid.org/0000-0003-1283-1672"},"institutions":[{"id":"https://openalex.org/I31746571","display_name":"UNSW Sydney","ror":"https://ror.org/03r8z3t63","country_code":"AU","type":"education","lineage":["https://openalex.org/I31746571"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yang Song","raw_affiliation_strings":["School of Computer Science and Engineering, University of New South Wales, Kensington, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of New South Wales, Kensington, NSW, Australia","institution_ids":["https://openalex.org/I31746571"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015261020","display_name":"Sanjay Jha","orcid":"https://orcid.org/0000-0002-1844-1520"},"institutions":[{"id":"https://openalex.org/I31746571","display_name":"UNSW Sydney","ror":"https://ror.org/03r8z3t63","country_code":"AU","type":"education","lineage":["https://openalex.org/I31746571"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Sanjay Jha","raw_affiliation_strings":["School of Computer Science and Engineering, University of New South Wales, Kensington, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of New South Wales, Kensington, NSW, Australia","institution_ids":["https://openalex.org/I31746571"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100329719"],"corresponding_institution_ids":["https://openalex.org/I31746571"],"apc_list":null,"apc_paid":null,"fwci":4.6071,"has_fulltext":true,"cited_by_count":13,"citation_normalized_percentile":{"value":0.95185956,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"2590","last_page":"2604"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7608832120895386},{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.7128865122795105},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6803930997848511},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6185230016708374},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5901888608932495},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.586955726146698},{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.5654259920120239},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.376376211643219},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.1321004331111908},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.09512582421302795},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.07926034927368164},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.06557813286781311}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7608832120895386},{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.7128865122795105},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6803930997848511},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6185230016708374},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5901888608932495},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.586955726146698},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.5654259920120239},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.376376211643219},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.1321004331111908},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.09512582421302795},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.07926034927368164},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.06557813286781311},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2024.3393714","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3393714","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2404.18094","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.18094","pdf_url":"https://arxiv.org/pdf/2404.18094","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2404.18094","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.18094","pdf_url":"https://arxiv.org/pdf/2404.18094","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4395447377.pdf","grobid_xml":"https://content.openalex.org/works/W4395447377.grobid-xml"},"referenced_works_count":83,"referenced_works":["https://openalex.org/W1574170747","https://openalex.org/W1857789879","https://openalex.org/W2094721231","https://openalex.org/W2160084280","https://openalex.org/W2299546089","https://openalex.org/W2745644908","https://openalex.org/W2794506738","https://openalex.org/W2888954148","https://openalex.org/W2891080700","https://openalex.org/W2911412811","https://openalex.org/W2928165649","https://openalex.org/W2962788625","https://openalex.org/W2963371159","https://openalex.org/W2963925437","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W2973158936","https://openalex.org/W2998572311","https://openalex.org/W3015783745","https://openalex.org/W3015826515","https://openalex.org/W3015853838","https://openalex.org/W3016159759","https://openalex.org/W3024869864","https://openalex.org/W3026777299","https://openalex.org/W3090254849","https://openalex.org/W3095035471","https://openalex.org/W3096514088","https://openalex.org/W3097297926","https://openalex.org/W3097538987","https://openalex.org/W3161109662","https://openalex.org/W3161704465","https://openalex.org/W3174758275","https://openalex.org/W3189951784","https://openalex.org/W3197103763","https://openalex.org/W3205878676","https://openalex.org/W3213544594","https://openalex.org/W4200300291","https://openalex.org/W4221156079","https://openalex.org/W4221166168","https://openalex.org/W4225746985","https://openalex.org/W4225956675","https://openalex.org/W4287641946","https://openalex.org/W4296068974","https://openalex.org/W4296070391","https://openalex.org/W4297841232","https://openalex.org/W4307783813","https://openalex.org/W4313679638","https://openalex.org/W4372267700","https://openalex.org/W4379924545","https://openalex.org/W4385245566","https://openalex.org/W4385822372","https://openalex.org/W4385822697","https://openalex.org/W4385895956","https://openalex.org/W4390685975","https://openalex.org/W4391020683","https://openalex.org/W4391468030","https://openalex.org/W6605232188","https://openalex.org/W6639480849","https://openalex.org/W6734815144","https://openalex.org/W6748588790","https://openalex.org/W6749489859","https://openalex.org/W6749555683","https://openalex.org/W6757817989","https://openalex.org/W6772349387","https://openalex.org/W6776605602","https://openalex.org/W6777335856","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6783527727","https://openalex.org/W6790220310","https://openalex.org/W6795261426","https://openalex.org/W6795807602","https://openalex.org/W6796464841","https://openalex.org/W6796581206","https://openalex.org/W6796730497","https://openalex.org/W6805710207","https://openalex.org/W6846600677","https://openalex.org/W6847363464","https://openalex.org/W6848735303","https://openalex.org/W6851724922","https://openalex.org/W6852870047","https://openalex.org/W6853937136","https://openalex.org/W6855540781"],"related_works":["https://openalex.org/W4362597605","https://openalex.org/W1574414179","https://openalex.org/W4297676672","https://openalex.org/W3009056573","https://openalex.org/W2922073769","https://openalex.org/W4281702477","https://openalex.org/W2490526372","https://openalex.org/W4376166922","https://openalex.org/W4378510483","https://openalex.org/W4320853675"],"abstract_inverted_index":{"Conventional":[0],"text-to-speech":[1],"(TTS)":[2],"research":[3],"has":[4],"predominantly":[5],"focused":[6],"on":[7,127,164,252],"enhancing":[8],"the":[9,17,67,89,169,189],"quality":[10],"of":[11,22,69,91,121,129,137,233,245],"synthesized":[12],"speech":[13,25,190],"for":[14,26,188,200],"speakers":[15,70,139],"in":[16,174],"training":[18],"dataset.":[19],"The":[20],"challenge":[21],"synthesizing":[23],"lifelike":[24],"unseen,":[27],"out-of-dataset":[28],"speakers,":[29,131,228,260],"especially":[30],"those":[31],"with":[32,71,114,140],"limited":[33],"reference":[34],"data,":[35],"remains":[36],"a":[37,84,134,185,209,216,230],"significant":[38,85],"and":[39,88,93,149,159,183,196,208,258,269],"unresolved":[40],"problem.":[41],"While":[42,74],"zero-shot":[43,103,175],"or":[44,104],"few-shot":[45,75,105,150,201],"speakeradaptive":[46,122],"TTS":[47,123,218,247],"approaches":[48,57,99],"have":[49,53],"been":[50],"explored,":[51],"they":[52,82],"many":[54],"limitations.":[55],"Zero-shot":[56],"tend":[58],"to":[59,65,241],"suffer":[60],"from":[61,225],"insufficient":[62,170],"generalization":[63,171],"performance":[64,172],"reproduce":[66,78],"voice":[68],"heavy":[72],"accents.":[73,142,236],"methods":[76],"can":[77],"highly":[79],"varying":[80],"accents,":[81],"bring":[83],"storage":[86,198],"burden":[87],"risk":[90],"overfitting":[92],"catastrophic":[94,194],"forgetting.":[95],"In":[96],"addition,":[97],"prior":[98],"only":[100,126],"provide":[101],"either":[102],"adaptation,":[106,177,203],"constraining":[107],"their":[108,165],"utility":[109],"across":[110,266],"varied":[111],"real-world":[112],"scenarios":[113],"different":[115],"demands.":[116],"Besides,":[117],"most":[118],"current":[119],"evaluations":[120,244],"are":[124],"conducted":[125],"datasets":[128,254],"native":[130,257],"inadvertently":[132],"neglecting":[133],"vast":[135],"portion":[136],"non-native":[138,227,234,259],"diverse":[141],"Our":[143],"proposed":[144],"framework":[145],"unifies":[146],"both":[147,256],"zeroshot":[148],"speaker":[151,176,202],"adaptation":[152,211],"strategies,":[153],"which":[154],"we":[155,178,204,214],"term":[156],"as":[157],"\u201cinstant\u201d":[158],"\u201cfine-grained\u201d":[160],"adaptations,":[161],"respectively,":[162],"based":[163],"merits.":[166],"To":[167,192],"alleviate":[168],"observed":[173],"designed":[179,205],"two":[180,206],"innovative":[181],"discriminators":[182],"introduced":[184],"memory":[186],"mechanism":[187],"decoder.":[191],"prevent":[193],"forgetting":[195],"reduce":[197],"implications":[199],"adapters":[207],"unique":[210],"procedure.":[212],"Additionally,":[213],"introduce":[215],"new":[217],"dataset":[219,238],"that":[220],"encompasses":[221],"44,000":[222],"English":[223,235],"utterances":[224],"134":[226],"capturing":[229],"wide":[231],"array":[232],"This":[237],"is":[239],"intended":[240],"enhance":[242],"holistic":[243],"adaptive":[246],"capabilities.":[248],"Through":[249],"comprehensive":[250],"experiments":[251],"multiple":[253],"comprising":[255],"our":[261],"approach":[262],"outperforms":[263],"contemporary":[264],"methodologies":[265],"various":[267],"subjective":[268],"objective":[270],"metrics.":[271]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":5}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
