{"id":"https://openalex.org/W3171055158","doi":"https://doi.org/10.21437/interspeech.2021-1583","title":"Ctrl-P: Temporal Control of Prosodic Variation for Speech Synthesis","display_name":"Ctrl-P: Temporal Control of Prosodic Variation for Speech Synthesis","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3171055158","doi":"https://doi.org/10.21437/interspeech.2021-1583","mag":"3171055158"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-1583","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1583","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2106.08352","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024033876","display_name":"Devang S Ram Mohan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Devang S. Ram Mohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017825726","display_name":"Vivian Hu","orcid":"https://orcid.org/0000-0002-5548-9283"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vivian Hu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038517086","display_name":"Tian Huey Teh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian Huey Teh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083188961","display_name":"Alexandra Torresquintero","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alexandra Torresquintero","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087090605","display_name":"Christopher G. R. Wallis","orcid":"https://orcid.org/0000-0001-7383-4186"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Christopher G.R. Wallis","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008306627","display_name":"Marlene Staib","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marlene Staib","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068657700","display_name":"Lorenzo Foglianti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lorenzo Foglianti","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007377741","display_name":"Jiameng Gao","orcid":"https://orcid.org/0000-0003-4161-938X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiameng Gao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5062516688","display_name":"Simon King","orcid":"https://orcid.org/0000-0002-2694-2843"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Simon King","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.1399,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54146525,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"3875","last_page":"3879"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.865414023399353},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8455716371536255},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.7904170751571655},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7224759459495544},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5899616479873657},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5357657074928284},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5227989554405212},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48504894971847534},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4702766537666321},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.4302116334438324},{"id":"https://openalex.org/keywords/latent-variable","display_name":"Latent variable","score":0.41841933131217957},{"id":"https://openalex.org/keywords/energy","display_name":"Energy (signal processing)","score":0.41245460510253906},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3124984800815582},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.11625462770462036},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.06799671053886414}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.865414023399353},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8455716371536255},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.7904170751571655},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7224759459495544},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5899616479873657},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5357657074928284},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5227989554405212},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48504894971847534},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4702766537666321},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.4302116334438324},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.41841933131217957},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.41245460510253906},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3124984800815582},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.11625462770462036},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.06799671053886414},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C44870925","wikidata":"https://www.wikidata.org/wiki/Q37547","display_name":"Astrophysics","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2021-1583","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1583","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2106.08352","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2106.08352","pdf_url":"https://arxiv.org/pdf/2106.08352","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"mag:3171055158","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2106.08352","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2106.08352","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2106.08352","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2106.08352","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2106.08352","pdf_url":"https://arxiv.org/pdf/2106.08352","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W75704375","https://openalex.org/W172929083","https://openalex.org/W1524333225","https://openalex.org/W1893920648","https://openalex.org/W2069859485","https://openalex.org/W2120847449","https://openalex.org/W2188827208","https://openalex.org/W2766812927","https://openalex.org/W2884607399","https://openalex.org/W2903538854","https://openalex.org/W2945544731","https://openalex.org/W2952269766","https://openalex.org/W2963272440","https://openalex.org/W2963568578","https://openalex.org/W2963927338","https://openalex.org/W2964138190","https://openalex.org/W2964243274","https://openalex.org/W2964307104","https://openalex.org/W2972473628","https://openalex.org/W2973013313","https://openalex.org/W2973158936","https://openalex.org/W2995751273","https://openalex.org/W3033411150","https://openalex.org/W3097003111","https://openalex.org/W3097795905","https://openalex.org/W3097892637","https://openalex.org/W3150572638","https://openalex.org/W3209073848"],"related_works":["https://openalex.org/W3202098869","https://openalex.org/W1524803019","https://openalex.org/W2027324763","https://openalex.org/W3168312215","https://openalex.org/W2101011209","https://openalex.org/W2289824910","https://openalex.org/W2010772370","https://openalex.org/W1977146070","https://openalex.org/W2790551780","https://openalex.org/W2897266704","https://openalex.org/W2461955563","https://openalex.org/W1494785419","https://openalex.org/W2973013313","https://openalex.org/W1925593453","https://openalex.org/W1610605641","https://openalex.org/W2294995564","https://openalex.org/W2037869774","https://openalex.org/W2015428738","https://openalex.org/W2398843493","https://openalex.org/W2058973588"],"abstract_inverted_index":{"Text":[0],"does":[1],"not":[2,23],"fully":[3],"specify":[4],"the":[5,26,33,71,76,88,105,152,178],"spoken":[6],"form,":[7],"so":[8],"text-to-speech":[9],"models":[10],"must":[11],"be":[12,66,114],"able":[13],"to":[14,31,42,65,127,135],"learn":[15,136],"from":[16,119,155,166],"speech":[17,84,159],"data":[18,40],"that":[19,82,130,160,165],"vary":[20],"in":[21,38,75],"ways":[22],"explained":[24],"by":[25],"corresponding":[27],"text.":[28],"One":[29],"way":[30],"reduce":[32],"amount":[34],"of":[35,62,70,93,107,177],"unexplained":[36,72],"variation":[37,73],"training":[39],"is":[41,74,101,161],"provide":[43],"acoustic":[44,56,91,153,180],"information":[45,57],"as":[46],"an":[47],"additional":[48],"learning":[49],"signal.":[50],"When":[51,149],"generating":[52],"speech,":[53],"modifying":[54],"this":[55],"enables":[58],"multiple":[59],"distinct":[60],"renditions":[61],"a":[63,80,128,132,167],"text":[64],"produced.":[67],"Since":[68],"much":[69],"prosody,":[77],"we":[78],"propose":[79],"model":[81,100,129,141,170],"generates":[83,158],"explicitly":[85],"conditioned":[86],"on":[87],"three":[89],"primary":[90],"correlates":[92],"prosody:":[94],"$F_{0}$,":[95],"energy":[96],"and":[97,146],"duration.":[98],"The":[99],"flexible":[102],"about":[103],"how":[104],"values":[106],"these":[108],"features":[109,154,181],"are":[110],"specified:":[111],"they":[112],"can":[113,182],"externally":[115],"provided,":[116],"or":[117,121],"predicted":[118,122,179],"text,":[120,156],"then":[123],"subsequently":[124],"modified.":[125],"Compared":[126],"employs":[131],"variational":[133],"auto-encoder":[134],"unsupervised":[137],"latent":[138],"features,":[139],"our":[140],"provides":[142],"more":[143,162],"interpretable,":[144],"temporally-precise,":[145],"disentangled":[147],"control.":[148],"automatically":[150],"predicting":[151],"it":[157],"natural":[163],"than":[164],"Tacotron":[168],"2":[169],"with":[171],"reference":[172],"encoder.":[173],"Subsequent":[174],"human-in-the-loop":[175],"modification":[176],"significantly":[183],"further":[184],"increase":[185],"naturalness.":[186]},"counts_by_year":[{"year":2022,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
