{"id":"https://openalex.org/W4386536196","doi":"https://doi.org/10.1109/taslp.2023.3313424","title":"Pronunciation Dictionary-Free Multilingual Speech Synthesis Using Learned Phonetic Representations","display_name":"Pronunciation Dictionary-Free Multilingual Speech Synthesis Using Learned Phonetic Representations","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4386536196","doi":"https://doi.org/10.1109/taslp.2023.3313424"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3313424","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3313424","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101983531","display_name":"Chang Liu","orcid":"https://orcid.org/0009-0003-1661-1113"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chang Liu","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0009-0003-1661-1113","affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059767940","display_name":"Zhen-Hua Ling","orcid":"https://orcid.org/0000-0001-7853-5273"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhen-Hua Ling","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0001-7853-5273","affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101593418","display_name":"Ling-Hui Chen","orcid":"https://orcid.org/0009-0009-4247-4128"},"institutions":[{"id":"https://openalex.org/I173632517","display_name":"MediaTek (China)","ror":"https://ror.org/05xvgy636","country_code":"CN","type":"company","lineage":["https://openalex.org/I173632517","https://openalex.org/I4210148979"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ling-Hui Chen","raw_affiliation_strings":["iFLYTEK Research, iFLYTEK Company Ltd., Hefei, China","iFLYTEK Research, iFLYTEK Company Limited, Hefei, China"],"raw_orcid":"https://orcid.org/0009-0009-4247-4128","affiliations":[{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Company Ltd., Hefei, China","institution_ids":["https://openalex.org/I173632517"]},{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Company Limited, Hefei, China","institution_ids":["https://openalex.org/I173632517"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101983531"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.6816,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.75788189,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":97},"biblio":{"volume":"31","issue":null,"first_page":"3706","last_page":"3716"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9717000126838684,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9617000222206116,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8035489320755005},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7449445724487305},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.670025110244751},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.6129123568534851},{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.5586724281311035},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5355745553970337},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.533809244632721},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5329946279525757},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4329453706741333},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.2370557188987732},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.11562833189964294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8035489320755005},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7449445724487305},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.670025110244751},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.6129123568534851},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.5586724281311035},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5355745553970337},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.533809244632721},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5329946279525757},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4329453706741333},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2370557188987732},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.11562833189964294},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3313424","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3313424","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7699999809265137,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":65,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1570629387","https://openalex.org/W2012897754","https://openalex.org/W2049686551","https://openalex.org/W2123237149","https://openalex.org/W2127141656","https://openalex.org/W2507912506","https://openalex.org/W2514457011","https://openalex.org/W2747874407","https://openalex.org/W2752796333","https://openalex.org/W2767052532","https://openalex.org/W2805993470","https://openalex.org/W2884873108","https://openalex.org/W2888922217","https://openalex.org/W2896457183","https://openalex.org/W2897353073","https://openalex.org/W2903739847","https://openalex.org/W2933138175","https://openalex.org/W2938583109","https://openalex.org/W2946200149","https://openalex.org/W2963609956","https://openalex.org/W2963799213","https://openalex.org/W2964243274","https://openalex.org/W2979476256","https://openalex.org/W3016139610","https://openalex.org/W3033411150","https://openalex.org/W3034772996","https://openalex.org/W3036601975","https://openalex.org/W3092028330","https://openalex.org/W3099078140","https://openalex.org/W3140429000","https://openalex.org/W3161417069","https://openalex.org/W3161695192","https://openalex.org/W3173767661","https://openalex.org/W3174758275","https://openalex.org/W3198048667","https://openalex.org/W3198429080","https://openalex.org/W3209059054","https://openalex.org/W4200300291","https://openalex.org/W4211221265","https://openalex.org/W4224629880","https://openalex.org/W4226132755","https://openalex.org/W4281760581","https://openalex.org/W4287173589","https://openalex.org/W4297808394","https://openalex.org/W4297841714","https://openalex.org/W4302213377","https://openalex.org/W4372260509","https://openalex.org/W4385245566","https://openalex.org/W4388400684","https://openalex.org/W6623517193","https://openalex.org/W6736996214","https://openalex.org/W6739901393","https://openalex.org/W6752124048","https://openalex.org/W6755207826","https://openalex.org/W6763832098","https://openalex.org/W6769196770","https://openalex.org/W6772381481","https://openalex.org/W6778823374","https://openalex.org/W6780218876","https://openalex.org/W6783867762","https://openalex.org/W6795952400","https://openalex.org/W6796730497","https://openalex.org/W6844194202","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2088854863","https://openalex.org/W2011227383","https://openalex.org/W2065606036","https://openalex.org/W1976719989","https://openalex.org/W2942893872","https://openalex.org/W3179495260","https://openalex.org/W3127543252","https://openalex.org/W2016904525"],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"a":[3,41,49,57,90,97,120,130,199],"multilingual":[4,147,200],"speech":[5,52,71,148],"synthesis":[6],"approach":[7],"that":[8,179],"leverages":[9],"learned":[10,24],"phonetic":[11,25,30,35],"representations":[12,26,31,36,85,128],"to":[13,65,132,167],"eliminate":[14],"the":[15,70,126,136,150,154,158,183,195,204],"need":[16],"for":[17,143],"pronunciation":[18],"dictionaries":[19],"in":[20,146],"target":[21,74,176],"languages.":[22,75],"The":[23,101,111],"consist":[27],"of":[28,73,138,152,206],"unsupervised":[29],"(UPR)":[32],"and":[33,81,96,106,123,157,193],"supervised":[34],"(SPR).":[37],"To":[38],"extract":[39],"UPRs,":[40],"pre-trained":[42,163],"wav2vec":[43],"2.0":[44],"model":[45,55,78,170,197],"is":[46,63,86],"utilized,":[47],"while":[48],"language-independent":[50],"automatic":[51],"recognition":[53],"(LI-ASR)":[54],"with":[56,117],"connectionist":[58],"temporal":[59],"classification":[60],"(CTC)":[61],"loss":[62],"employed":[64],"derive":[66],"segment-level":[67],"SPRs":[68,82,107,118],"from":[69,108,188],"data":[72],"An":[76],"acoustic":[77,196],"using":[79,119,198],"UPRs":[80,105,116],"as":[83],"intermediate":[84],"then":[87,124],"designed,":[88],"comprising":[89],"UPR":[91],"predictor,":[92,95],"an":[93],"SPR":[94],"representation-to-mel-spectrogram":[98],"(RTM)":[99],"converter.":[100],"two":[102,155],"predictors":[103,156],"generate":[104],"texts,":[109],"respectively.":[110],"RTM":[112,159],"converter":[113,160],"first":[114],"combines":[115],"Transformer-based":[121],"encoder,":[122],"feeds":[125],"merged":[127],"into":[129],"decoder":[131],"produce":[133],"mel-spectrograms.":[134],"Considering":[135],"difficulty":[137],"collecting":[139],"large":[140],"training":[141],"corpora":[142],"all":[144],"languages":[145,166,177],"synthesis,":[149],"parameters":[151],"both":[153],"can":[161],"be":[162],"on":[164,174],"non-target":[165],"further":[168,202],"improve":[169],"performance.":[171],"Experimental":[172],"results":[173],"six":[175],"demonstrate":[178],"our":[180],"method":[181],"outperformed":[182],"approaches":[184],"directly":[185],"predicting":[186],"mel-spectrograms":[187],"character":[189],"or":[190],"phoneme":[191],"sequences,":[192],"pre-training":[194],"corpus":[201],"improved":[203],"performance":[205],"synthetic":[207],"speech.":[208]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
