{"id":"https://openalex.org/W4372341178","doi":"https://doi.org/10.1109/icassp49357.2023.10095431","title":"Autotts: End-to-End Text-to-Speech Synthesis Through Differentiable Duration Modeling","display_name":"Autotts: End-to-End Text-to-Speech Synthesis Through Differentiable Duration Modeling","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372341178","doi":"https://doi.org/10.1109/icassp49357.2023.10095431"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095431","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095431","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040989789","display_name":"Bac Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bac Nguyen","raw_affiliation_strings":["Stuttgart Laboratory 1,Sony Europe B.V., R&amp;D Center,Germany"],"affiliations":[{"raw_affiliation_string":"Stuttgart Laboratory 1,Sony Europe B.V., R&amp;D Center,Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056623642","display_name":"Fabien Cardinaux","orcid":"https://orcid.org/0000-0003-2921-4873"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fabien Cardinaux","raw_affiliation_strings":["Stuttgart Laboratory 1,Sony Europe B.V., R&amp;D Center,Germany"],"affiliations":[{"raw_affiliation_string":"Stuttgart Laboratory 1,Sony Europe B.V., R&amp;D Center,Germany","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010554448","display_name":"Stefan Uhlich","orcid":"https://orcid.org/0000-0003-3158-4945"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stefan Uhlich","raw_affiliation_strings":["Stuttgart Laboratory 1,Sony Europe B.V., R&amp;D Center,Germany"],"affiliations":[{"raw_affiliation_string":"Stuttgart Laboratory 1,Sony Europe B.V., R&amp;D Center,Germany","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5040989789"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.6983,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.74380926,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/duration","display_name":"Duration (music)","score":0.7827470302581787},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7201759219169617},{"id":"https://openalex.org/keywords/differentiable-function","display_name":"Differentiable function","score":0.6640615463256836},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6523488759994507},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5911003351211548},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5424819588661194},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5211330056190491},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.47971194982528687},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.4718515872955322},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.4617142677307129},{"id":"https://openalex.org/keywords/monotonic-function","display_name":"Monotonic function","score":0.46013277769088745},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.451631635427475},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.32435035705566406},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.12475648522377014},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.10901382565498352},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06953838467597961}],"concepts":[{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.7827470302581787},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7201759219169617},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.6640615463256836},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6523488759994507},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5911003351211548},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5424819588661194},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5211330056190491},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.47971194982528687},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.4718515872955322},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.4617142677307129},{"id":"https://openalex.org/C72169020","wikidata":"https://www.wikidata.org/wiki/Q194404","display_name":"Monotonic function","level":2,"score":0.46013277769088745},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.451631635427475},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.32435035705566406},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.12475648522377014},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.10901382565498352},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06953838467597961},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095431","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095431","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6200000047683716,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W2519091744","https://openalex.org/W2591927543","https://openalex.org/W2593414223","https://openalex.org/W2619368999","https://openalex.org/W2747874407","https://openalex.org/W2903739847","https://openalex.org/W2908510526","https://openalex.org/W2913932916","https://openalex.org/W2946200149","https://openalex.org/W2963609956","https://openalex.org/W2963925437","https://openalex.org/W2964167449","https://openalex.org/W2964243274","https://openalex.org/W2972702018","https://openalex.org/W3015338123","https://openalex.org/W3016136182","https://openalex.org/W3026874504","https://openalex.org/W3033411150","https://openalex.org/W3038172701","https://openalex.org/W3092028330","https://openalex.org/W3095883095","https://openalex.org/W3096442195","https://openalex.org/W3097538987","https://openalex.org/W3161296985","https://openalex.org/W3166714886","https://openalex.org/W3169905056","https://openalex.org/W3172148458","https://openalex.org/W3197294703","https://openalex.org/W4200300291","https://openalex.org/W4280561221","https://openalex.org/W4287761884","https://openalex.org/W4320013936","https://openalex.org/W4385245566","https://openalex.org/W6687506355","https://openalex.org/W6736996214","https://openalex.org/W6739901393","https://openalex.org/W6757817989","https://openalex.org/W6763832098","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6779337556","https://openalex.org/W6779871621","https://openalex.org/W6783867762","https://openalex.org/W6787300339","https://openalex.org/W6795261426","https://openalex.org/W6796464841","https://openalex.org/W6838721020","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W1996226968","https://openalex.org/W4313443006","https://openalex.org/W2945374968","https://openalex.org/W4385452045","https://openalex.org/W4293777179","https://openalex.org/W2164070813","https://openalex.org/W2135608140","https://openalex.org/W4224231624","https://openalex.org/W2319626700","https://openalex.org/W3099598016"],"abstract_inverted_index":{"Parallel":[0],"text-to-speech":[1],"(TTS)":[2],"models":[3],"have":[4],"recently":[5],"enabled":[6],"fast":[7],"and":[8,48,91],"highly-natural":[9],"speech":[10,77,83],"synthesis.":[11],"However,":[12],"they":[13,28],"typically":[14],"require":[15],"external":[16],"alignment":[17],"models,":[18],"which":[19],"are":[20,29,115],"not":[21,30],"necessarily":[22],"optimized":[23],"for":[24,42],"the":[25,93],"decoder":[26],"as":[27],"jointly":[31],"trained.":[32],"In":[33],"this":[34,67],"paper,":[35],"we":[36,71],"propose":[37],"a":[38,56,61,74,86,108],"differentiable":[39,68],"duration":[40,69],"method":[41,52],"learning":[43],"monotonic":[44],"alignments":[45],"between":[46],"input":[47],"output":[49],"sequences.":[50],"Our":[51],"is":[53],"based":[54],"on":[55],"soft-duration":[57],"mechanism":[58],"that":[59,100],"optimizes":[60],"stochastic":[62],"process":[63],"in":[64],"expectation.":[65],"Using":[66],"method,":[70],"introduce":[72],"AutoTTS,":[73],"direct":[75],"text-to-waveform":[76],"synthesis":[78,84],"model.":[79],"AutoTTS":[80],"enables":[81],"high-fidelity":[82],"through":[85],"combination":[87],"of":[88],"adversarial":[89],"training":[90,111],"matching":[92],"total":[94],"ground-truth":[95],"duration.":[96],"Experimental":[97],"results":[98,105],"show":[99],"our":[101],"model":[102],"obtains":[103],"competitive":[104],"while":[106],"enjoying":[107],"much":[109],"simpler":[110],"pipeline.":[112],"Audio":[113],"samples":[114],"available":[116],"online":[117],"<sup":[118],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[119],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[120],".":[121]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
