{"id":"https://openalex.org/W2949973225","doi":"https://doi.org/10.21437/ssw.2019-43","title":"Using generative modelling to produce varied intonation for speech synthesis","display_name":"Using generative modelling to produce varied intonation for speech synthesis","publication_year":2019,"publication_date":"2019-09-14","ids":{"openalex":"https://openalex.org/W2949973225","doi":"https://doi.org/10.21437/ssw.2019-43","mag":"2949973225"},"language":"en","primary_location":{"id":"doi:10.21437/ssw.2019-43","is_oa":false,"landing_page_url":"https://doi.org/10.21437/ssw.2019-43","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"10th ISCA Workshop on Speech Synthesis (SSW 10)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1906.04233","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zack Hodari","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zack Hodari","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Oliver Watts","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oliver Watts","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Simon King","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Simon King","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.7225,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.78350128,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"239","last_page":"244"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/intonation","display_name":"Intonation (linguistics)","score":0.7757999897003174},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.7042999863624573},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6406000256538391},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.637499988079071},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.4781999886035919},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.42910000681877136},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.3425999879837036}],"concepts":[{"id":"https://openalex.org/C2781045179","wikidata":"https://www.wikidata.org/wiki/Q5576720","display_name":"Intonation (linguistics)","level":2,"score":0.7757999897003174},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.7042999863624573},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6406000256538391},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.637499988079071},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6205999851226807},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5407000184059143},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5293999910354614},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.4781999886035919},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.40450000762939453},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.3425999879837036},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29170000553131104},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.2838999927043915},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2797999978065491},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.2621000111103058}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/ssw.2019-43","is_oa":false,"landing_page_url":"https://doi.org/10.21437/ssw.2019-43","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"10th ISCA Workshop on Speech Synthesis (SSW 10)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1906.04233","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1906.04233","pdf_url":"https://arxiv.org/pdf/1906.04233","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1906.04233","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1906.04233","pdf_url":"https://arxiv.org/pdf/1906.04233","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Unlike":[0],"human":[1],"speakers,":[2],"typical":[3,153],"text-to-speech":[4],"(TTS)":[5],"systems":[6],"are":[7,32],"unable":[8],"to":[9,34,108],"produce":[10,43],"multiple":[11,39,86],"distinct":[12],"renditions":[13,40,45],"of":[14,55,67,111,123,170,188],"a":[15,36],"given":[16],"sentence.":[17],"This":[18],"has":[19],"previously":[20],"been":[21],"addressed":[22],"by":[23,89,118],"adding":[24],"explicit":[25],"external":[26],"control.":[27],"In":[28,64],"contrast,":[29],"generative":[30,81],"models":[31,51,155],"able":[33],"capture":[35],"distribution":[37],"over":[38],"and":[41,147,150],"thus":[42],"varied":[44,135,177],"using":[46],"sampling.":[47],"Typical":[48],"neural":[49],"TTS":[50],"learn":[52],"the":[53,56,65,70,103,109,112,121,124,127,143,168,171,180,185],"average":[54,71,93],"data":[57,106],"because":[58],"they":[59],"minimise":[60],"mean":[61,110],"squared":[62],"error.":[63],"context":[66],"prosody,":[68],"taking":[69],"produces":[72,174],"flatter,":[73],"more":[74,133,176],"boring":[75],"speech:":[76],"an":[77],"\"average":[78],"prosody\".":[79],"A":[80],"model":[82,92,128],"that":[83,117,152],"can":[84,156],"synthesise":[85],"prosodies":[87],"will,":[88],"design,":[90],"not":[91,163],"prosody.":[94],"We":[95,115],"use":[96],"variational":[97],"autoencoders":[98],"(VAEs)":[99],"which":[100],"explicitly":[101],"place":[102],"most":[104],"\"average\"":[105],"close":[107],"Gaussian":[113],"prior.":[114],"propose":[116],"moving":[119],"towards":[120,131],"tails":[122,169],"prior":[125,173],"distribution,":[126],"will":[129],"transition":[130],"generating":[132],"idiosyncratic,":[134],"renditions.":[136],"Focusing":[137],"here":[138],"on":[139],"intonation,":[140],"we":[141],"investigate":[142],"trade-off":[144],"between":[145],"naturalness":[146],"intonation":[148,178],"variation":[149],"find":[151],"acoustic":[154],"either":[157],"be":[158],"natural,":[159],"or":[160],"varied,":[161],"but":[162],"both.":[164],"However,":[165],"sampling":[166],"from":[167],"VAE":[172],"much":[175],"than":[179],"traditional":[181],"approaches,":[182],"whilst":[183],"maintaining":[184],"same":[186],"level":[187],"naturalness.":[189]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2019-06-27T00:00:00"}
