{"id":"https://openalex.org/W3096442195","doi":"https://doi.org/10.21437/interspeech.2020-2123","title":"JDI-T: Jointly Trained Duration Informed Transformer for Text-To-Speech without Explicit Alignment","display_name":"JDI-T: Jointly Trained Duration Informed Transformer for Text-To-Speech without Explicit Alignment","publication_year":2020,"publication_date":"2020-10-25","ids":{"openalex":"https://openalex.org/W3096442195","doi":"https://doi.org/10.21437/interspeech.2020-2123","mag":"3096442195"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2020-2123","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-2123","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027636448","display_name":"Dan Lim","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dan Lim","raw_affiliation_strings":["Kakao Corp., Seongnam, Korea","Kakao Enterprise Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Corp., Seongnam, Korea","institution_ids":[]},{"raw_affiliation_string":"Kakao Enterprise Corp., Seongnam, Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103192158","display_name":"Won Jang","orcid":"https://orcid.org/0000-0002-4711-780X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Won Jang","raw_affiliation_strings":["Kakao Enterprise Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corp., Seongnam, Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006362544","display_name":"O Gyeonghwan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gyeonghwan O","raw_affiliation_strings":["Kakao Enterprise Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corp., Seongnam, Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027073468","display_name":"Heayoung Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heayoung Park","raw_affiliation_strings":["Kakao Enterprise Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corp., Seongnam, Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061409578","display_name":"Bong\u2010Wan Kim","orcid":"https://orcid.org/0000-0001-9059-0451"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bongwan Kim","raw_affiliation_strings":["Kakao Enterprise Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corp., Seongnam, Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050154985","display_name":"Jaesam Yoon","orcid":"https://orcid.org/0000-0002-9978-0582"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jaesam Yoon","raw_affiliation_strings":["Kakao Enterprise Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise Corp., Seongnam, Korea","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5027636448"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.5338,"has_fulltext":false,"cited_by_count":32,"citation_normalized_percentile":{"value":0.94130517,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4004","last_page":"4008"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6980191469192505},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6868584156036377},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5856418609619141},{"id":"https://openalex.org/keywords/duration","display_name":"Duration (music)","score":0.5300264954566956},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.505563497543335},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4001687169075012},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.15699061751365662},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.1474258005619049},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.11060300469398499},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08953052759170532}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6980191469192505},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6868584156036377},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5856418609619141},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.5300264954566956},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.505563497543335},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4001687169075012},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.15699061751365662},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.1474258005619049},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.11060300469398499},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08953052759170532},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2020-2123","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-2123","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.4300000071525574,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2120847449","https://openalex.org/W2191779130","https://openalex.org/W2519091744","https://openalex.org/W2767052532","https://openalex.org/W2886769154","https://openalex.org/W2903739847","https://openalex.org/W2942807473","https://openalex.org/W2946200149","https://openalex.org/W2963300588","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2971753973","https://openalex.org/W2971905065","https://openalex.org/W3015338123","https://openalex.org/W3015922793","https://openalex.org/W3016136182","https://openalex.org/W3016160783","https://openalex.org/W4288265053","https://openalex.org/W4295312788","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W2348837382","https://openalex.org/W746329893","https://openalex.org/W4205872570","https://openalex.org/W4245971243","https://openalex.org/W1922805944","https://openalex.org/W4253588120","https://openalex.org/W2383732295","https://openalex.org/W2942717012","https://openalex.org/W4248716494","https://openalex.org/W3204019825"],"abstract_inverted_index":{"We":[0],"propose":[1],"Jointly":[2],"trained":[3,16,142],"Duration":[4],"Informed":[5],"Transformer":[6,10,69,104],"(JDI-T),":[7],"a":[8,12,58,87,108,114],"feed-forward":[9,103],"with":[11],"duration":[13,41,65,89,111],"predictor":[14],"jointly":[15,100],"without":[17,105],"explicit":[18],"alignments":[19],"in":[20,113],"order":[21],"to":[22,57,99,136],"generate":[23],"an":[24,29],"acoustic":[25],"feature":[26],"sequence":[27],"from":[28,66],"input":[30],"text.In":[31],"this":[32],"work,":[33],"inspired":[34],"by":[35,143],"the":[36,40,63,67,71,74,80,96,102,119,122,126,137],"recent":[37],"success":[38],"of":[39,78,121],"informed":[42],"networks":[43],"such":[44],"as":[45,86],"FastSpeech":[46],"and":[47,83],"DurIAN,":[48],"we":[49,61],"further":[50],"simplify":[51],"its":[52],"sequential,":[53],"two-stage":[54],"training":[55,76,116],"pipeline":[56],"single-stage":[59],"training.Specifically,":[60],"extract":[62],"phoneme":[64,88,110],"autoregressive":[68,81],"on":[70,107,125],"fly":[72],"during":[73],"joint":[75],"instead":[77],"pretraining":[79],"model":[82,124],"using":[84],"it":[85,94],"extractor.To":[90],"our":[91],"best":[92],"knowledge,":[93],"is":[95],"first":[97],"implementation":[98],"train":[101],"relying":[106],"pre-trained":[109],"extractor":[112],"single":[115],"pipeline.We":[117],"evaluate":[118],"effectiveness":[120],"proposed":[123],"publicly":[127],"available":[128],"Korean":[129],"Single":[130],"speaker":[131],"Speech":[132],"(KSS)":[133],"dataset":[134],"compared":[135],"baseline":[138],"text-to-speech":[139],"(TTS)":[140],"models":[141],"ESPnet-TTS.":[144]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":15}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
