{"id":"https://openalex.org/W2936295285","doi":"https://doi.org/10.21437/interspeech.2019-1518","title":"Unsupervised Acoustic Unit Discovery for Speech Synthesis Using Discrete Latent-Variable Neural Networks","display_name":"Unsupervised Acoustic Unit Discovery for Speech Synthesis Using Discrete Latent-Variable Neural Networks","publication_year":2019,"publication_date":"2019-09-13","ids":{"openalex":"https://openalex.org/W2936295285","doi":"https://doi.org/10.21437/interspeech.2019-1518","mag":"2936295285"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2019-1518","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2019-1518","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2019","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1904.07556","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047706306","display_name":"Ryan Eloff","orcid":null},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":true,"raw_author_name":"Ryan Eloff","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110260045","display_name":"Andr\u00e9 Nortje","orcid":null},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Andr\u00e9 Nortje","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046382731","display_name":"Benjamin van Niekerk","orcid":"https://orcid.org/0000-0001-9207-6309"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Benjamin van Niekerk","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015106629","display_name":"Avashna Govender","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Avashna Govender","raw_affiliation_strings":["University of Edinburgh, Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh, Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020033363","display_name":"Leanne Nortje","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leanne Nortje","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017043594","display_name":"Arnu Pretorius","orcid":null},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Arnu Pretorius","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080963336","display_name":"Elan van Biljon","orcid":null},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Elan van Biljon","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039075258","display_name":"Ewald van der Westhuizen","orcid":"https://orcid.org/0000-0002-7430-503X"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Ewald van der Westhuizen","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010864987","display_name":"Lisa van Staden","orcid":null},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Lisa van Staden","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040305929","display_name":"Herman Kamper","orcid":"https://orcid.org/0000-0003-2980-3475"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Herman Kamper","raw_affiliation_strings":["StellenBosch University"],"affiliations":[{"raw_affiliation_string":"StellenBosch University","institution_ids":["https://openalex.org/I26092322"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5047706306"],"corresponding_institution_ids":["https://openalex.org/I26092322"],"apc_list":null,"apc_paid":null,"fwci":1.4466,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":{"value":0.86483335,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1103","last_page":"1107"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.7887523174285889},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.733277440071106},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.653933048248291},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5991826057434082},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5170453190803528},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4928395748138428},{"id":"https://openalex.org/keywords/latent-variable","display_name":"Latent variable","score":0.4581623673439026},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4486178159713745},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.43491220474243164},{"id":"https://openalex.org/keywords/unsupervised-learning","display_name":"Unsupervised learning","score":0.4262910485267639},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.38728609681129456},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.1228884756565094}],"concepts":[{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.7887523174285889},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.733277440071106},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.653933048248291},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5991826057434082},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5170453190803528},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4928395748138428},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.4581623673439026},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4486178159713745},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.43491220474243164},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.4262910485267639},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.38728609681129456},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.1228884756565094}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2019-1518","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2019-1518","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2019","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1904.07556","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1904.07556","pdf_url":"https://arxiv.org/pdf/1904.07556","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2936295285","is_oa":true,"landing_page_url":"https://arxiv.org/abs/1904.07556","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1904.07556","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1904.07556","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1904.07556","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1904.07556","pdf_url":"https://arxiv.org/pdf/1904.07556","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.7799999713897705,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320332999","display_name":"Horizon 2020 Framework Programme","ror":"https://ror.org/00k4n6c32"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2936295285.pdf","grobid_xml":"https://content.openalex.org/works/W2936295285.grobid-xml"},"referenced_works_count":42,"referenced_works":["https://openalex.org/W52412328","https://openalex.org/W1577418252","https://openalex.org/W1778492285","https://openalex.org/W1796128977","https://openalex.org/W1967924372","https://openalex.org/W2010188467","https://openalex.org/W2020607164","https://openalex.org/W2025482506","https://openalex.org/W2035424729","https://openalex.org/W2091746061","https://openalex.org/W2126203737","https://openalex.org/W2134202996","https://openalex.org/W2148154194","https://openalex.org/W2242818861","https://openalex.org/W2346964103","https://openalex.org/W2347098582","https://openalex.org/W2395899413","https://openalex.org/W2396043527","https://openalex.org/W2404799143","https://openalex.org/W2476548250","https://openalex.org/W2547039119","https://openalex.org/W2547875792","https://openalex.org/W2548228487","https://openalex.org/W2598638573","https://openalex.org/W2787447541","https://openalex.org/W2826003142","https://openalex.org/W2890983311","https://openalex.org/W2911249026","https://openalex.org/W2945769669","https://openalex.org/W2949382160","https://openalex.org/W2951004968","https://openalex.org/W2962790638","https://openalex.org/W2963149687","https://openalex.org/W2963618559","https://openalex.org/W2963619462","https://openalex.org/W2963620343","https://openalex.org/W2963799213","https://openalex.org/W2963830550","https://openalex.org/W2964115348","https://openalex.org/W2964121744","https://openalex.org/W2973026522","https://openalex.org/W3125709657"],"related_works":["https://openalex.org/W3125709657","https://openalex.org/W3095361818","https://openalex.org/W2087368178","https://openalex.org/W2949510815","https://openalex.org/W3161411634","https://openalex.org/W2156692643","https://openalex.org/W2950523597","https://openalex.org/W2095871591","https://openalex.org/W2015633636","https://openalex.org/W2165931417","https://openalex.org/W2291725357","https://openalex.org/W2400957076","https://openalex.org/W2973049979","https://openalex.org/W2963799213","https://openalex.org/W2947591107","https://openalex.org/W2842511635","https://openalex.org/W2547875792","https://openalex.org/W2566781703","https://openalex.org/W2112173325","https://openalex.org/W2097431866"],"abstract_inverted_index":{"For":[0,118],"our":[1],"submission":[2],"to":[3,14,98,106,110,169],"the":[4,19,67,71,114,170],"ZeroSpeech":[5],"2019":[6],"challenge,":[7],"we":[8],"apply":[9],"discrete":[10,26,161],"latent-variable":[11],"neural":[12,108],"networks":[13],"unlabelled":[15],"speech":[16,23,43,83,112],"and":[17,127,149],"use":[18,49],"discovered":[20],"units":[21],"for":[22,32],"synthesis.":[24],"Unsupervised":[25],"subword":[27],"modelling":[28,64],"could":[29],"be":[30],"useful":[31],"studies":[33],"of":[34],"phonetic":[35],"category":[36],"learning":[37],"in":[38,41,113],"infants":[39],"or":[40],"low-resource":[42],"technology":[44],"requiring":[45],"symbolic":[46],"input.":[47],"We":[48,57,153],"an":[50,85,150],"autoencoder":[51],"(AE)":[52],"architecture":[53],"with":[54],"intermediate":[55],"discretisation.":[56],"decouple":[58],"acoustic":[59,162],"unit":[60,78,90],"discovery":[61,79],"from":[62,84],"speaker":[63,73,97,157],"by":[65,89],"conditioning":[66,158],"AE's":[68],"decoder":[69],"on":[70,82,93,136],"training":[72],"identity.":[74],"At":[75],"test":[76],"time,":[77],"is":[80,104],"performed":[81],"unseen":[86],"speaker,":[87],"followed":[88],"decoding":[91,148],"conditioned":[92],"a":[94,107],"known":[95],"target":[96,115],"obtain":[99],"reconstructed":[100],"filterbanks.":[101],"This":[102],"output":[103],"fed":[105],"vocoder":[109],"synthesise":[111],"speaker's":[116],"voice.":[117],"discretisation,":[119,146],"categorical":[120],"variational":[121],"autoencoders":[122],"(CatVAEs),":[123],"vector-quantised":[124],"VAEs":[125],"(VQ-VAEs)":[126],"straight-through":[128],"estimation":[129],"are":[130],"compared":[131,168],"at":[132],"different":[133],"compression":[134],"levels":[135],"two":[137],"languages.":[138],"Our":[139],"final":[140],"model":[141],"uses":[142],"convolutional":[143],"encoding,":[144],"VQ-VAE":[145],"deconvolutional":[147],"FFTNet":[151],"vocoder.":[152],"show":[154],"that":[155],"decoupled":[156],"intrinsically":[159],"improves":[160],"representations,":[163],"yielding":[164],"competitive":[165],"synthesis":[166],"quality":[167],"challenge":[171],"baseline.":[172]},"counts_by_year":[{"year":2021,"cited_by_count":5},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":2}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
