{"id":"https://openalex.org/W4385993818","doi":"https://doi.org/10.21437/ssw.2023-21","title":"An analysis on the effects of speaker embedding choice in non auto-regressive TTS","display_name":"An analysis on the effects of speaker embedding choice in non auto-regressive TTS","publication_year":2023,"publication_date":"2023-08-18","ids":{"openalex":"https://openalex.org/W4385993818","doi":"https://doi.org/10.21437/ssw.2023-21"},"language":"en","primary_location":{"id":"doi:10.21437/ssw.2023-21","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/ssw.2023-21","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"12th ISCA Speech Synthesis Workshop (SSW2023)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://www.research.ed.ac.uk/en/publications/49108f6d-f426-42f1-9d98-a76632f47565","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018126799","display_name":"Adriana Stan","orcid":"https://orcid.org/0000-0003-2894-5770"},"institutions":[{"id":"https://openalex.org/I158333966","display_name":"Technical University of Cluj-Napoca","ror":"https://ror.org/03r8nwp71","country_code":"RO","type":"education","lineage":["https://openalex.org/I158333966"]}],"countries":["RO"],"is_corresponding":true,"raw_author_name":"Adriana Stan","raw_affiliation_strings":["Communications Department, Technical University of Cluj-Napoca, Romania"],"affiliations":[{"raw_affiliation_string":"Communications Department, Technical University of Cluj-Napoca, Romania","institution_ids":["https://openalex.org/I158333966"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072423792","display_name":"Johannah O\u2019Mahony","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Johannah O'Mahony","raw_affiliation_strings":["Centre for Speech Technology Research, University of Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Centre for Speech Technology Research, University of Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5018126799"],"corresponding_institution_ids":["https://openalex.org/I158333966"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.09700118,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"134","last_page":"138"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7923155426979065},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.6939136981964111},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6893705725669861},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6289898157119751},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.6031429767608643},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.5613660216331482},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5442119836807251},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5391104817390442},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.5269765853881836},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5185095071792603},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4934150278568268},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4382176399230957},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.41975176334381104},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3786924481391907},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32602182030677795},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08924013376235962}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7923155426979065},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.6939136981964111},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6893705725669861},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6289898157119751},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.6031429767608643},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.5613660216331482},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5442119836807251},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5391104817390442},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5269765853881836},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5185095071792603},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4934150278568268},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4382176399230957},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.41975176334381104},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3786924481391907},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32602182030677795},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08924013376235962},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.21437/ssw.2023-21","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/ssw.2023-21","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"12th ISCA Speech Synthesis Workshop (SSW2023)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.ed.ac.uk:openaire/49108f6d-f426-42f1-9d98-a76632f47565","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/en/publications/49108f6d-f426-42f1-9d98-a76632f47565","pdf_url":null,"source":{"id":"https://openalex.org/S4406922455","display_name":"Edinburgh Research Explorer","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Stan, A & O'Mahony, J 2023, An analysis on the effects of speaker embedding choice in non auto-regressive TTS. in G Bailly, T Hueber, D Lolive, N Obin & O Perrotin (eds), Proceedings of the 12th ISCA Speech Synthesis Workshop : (SSW2023). Proceedings of the ISCA Workshop, Grenoble, pp. 134-138, 12th ISCA Speech Synthesis Workshop , Grenoble, France, 26/08/23. https://doi.org/10.21437/SSW.2023-21","raw_type":"contributionToPeriodical"},{"id":"pmh:oai:pure.ed.ac.uk:publications/49108f6d-f426-42f1-9d98-a76632f47565","is_oa":true,"landing_page_url":"https://www.isca-speech.org/archive/ssw_2023/stan23_ssw.html","pdf_url":null,"source":{"id":"https://openalex.org/S4306400321","display_name":"Edinburgh Research Explorer (University of Edinburgh)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98677209","host_organization_name":"University of Edinburgh","host_organization_lineage":["https://openalex.org/I98677209"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Stan, A & O'Mahony, J 2023, An analysis on the effects of speaker embedding choice in non auto-regressive TTS. in G Bailly, T Hueber, D Lolive, N Obin & O Perrotin (eds), Proceedings of the 12th ISCA Speech Synthesis Workshop : (SSW2023). Proceedings of the ISCA Workshop, Grenoble, pp. 134-138, 12th ISCA Speech Synthesis Workshop , Grenoble, France, 26/08/23. https://doi.org/10.21437/SSW.2023-21","raw_type":"contributionToPeriodical"}],"best_oa_location":{"id":"pmh:oai:pure.ed.ac.uk:openaire/49108f6d-f426-42f1-9d98-a76632f47565","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/en/publications/49108f6d-f426-42f1-9d98-a76632f47565","pdf_url":null,"source":{"id":"https://openalex.org/S4406922455","display_name":"Edinburgh Research Explorer","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Stan, A & O'Mahony, J 2023, An analysis on the effects of speaker embedding choice in non auto-regressive TTS. in G Bailly, T Hueber, D Lolive, N Obin & O Perrotin (eds), Proceedings of the 12th ISCA Speech Synthesis Workshop : (SSW2023). Proceedings of the ISCA Workshop, Grenoble, pp. 134-138, 12th ISCA Speech Synthesis Workshop , Grenoble, France, 26/08/23. https://doi.org/10.21437/SSW.2023-21","raw_type":"contributionToPeriodical"},"sustainable_development_goals":[{"score":0.4399999976158142,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W2150769028","https://openalex.org/W2738884019","https://openalex.org/W2807627734","https://openalex.org/W2808706139","https://openalex.org/W3008391559","https://openalex.org/W3015826515","https://openalex.org/W3024869864","https://openalex.org/W3090254849","https://openalex.org/W3092028330","https://openalex.org/W3096086473","https://openalex.org/W3150572638","https://openalex.org/W3161704465","https://openalex.org/W3208065268","https://openalex.org/W4225680573","https://openalex.org/W4285666836","https://openalex.org/W4307273323","https://openalex.org/W4307307993","https://openalex.org/W4320459320"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1493012537","https://openalex.org/W1999004162","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2938358845","https://openalex.org/W2997340161"],"abstract_inverted_index":{"In":[0,48],"this":[1],"paper":[2],"we":[3,52],"introduce":[4],"a":[5,11,49],"first":[6],"attempt":[7],"on":[8],"understanding":[9],"how":[10,54],"non-autoregressive":[12],"factorised":[13],"multi-speaker":[14],"speech":[15,64,103],"synthesis":[16,116],"architecture":[17],"exploits":[18],"the":[19,32,55,61,81,89,111,115,121],"information":[20],"present":[21],"in":[22,68,102,120],"different":[23,56],"speaker":[24,46,71,94,108],"embedding":[25],"sets.":[26],"We":[27,76],"analyse":[28],"if":[29],"jointly":[30],"learning":[31,87],"representations,":[33],"and":[34,73,86,106],"initialising":[35],"them":[36],"from":[37],"pretrained":[38],"models":[39],"determine":[40],"any":[41],"quality":[42],"improvements":[43],"for":[44],"target":[45],"identities.":[47],"separate":[50],"analysis,":[51],"investigate":[53],"sets":[57],"of":[58,70,80,84,114],"embeddings":[59,85],"impact":[60],"network\u2019s":[62],"core":[63,112],"abstraction":[65],"(i.e.zero":[66],"conditioned)":[67],"terms":[69],"identity":[72],"representation":[74],"learning.":[75],"show":[77],"that,":[78],"regardless":[79],"used":[82],"set":[83],"strategy,":[88],"network":[90],"can":[91],"handle":[92],"various":[93],"identities":[95],"equally":[96],"well,":[97],"with":[98],"barely":[99],"noticeable":[100],"variations":[101],"output":[104],"quality,":[105],"that":[107],"leakage":[109],"within":[110],"structure":[113],"system":[117],"is":[118],"inevitable":[119],"standard":[122],"training":[123],"procedures":[124],"adopted":[125],"thus":[126],"far.":[127]},"counts_by_year":[],"updated_date":"2025-12-21T23:12:01.093139","created_date":"2025-10-10T00:00:00"}
