{"id":"https://openalex.org/W3015614880","doi":"https://doi.org/10.1109/icassp40776.2020.9053704","title":"Improving LPCNET-Based Text-to-Speech with Linear Prediction-Structured Mixture Density Network","display_name":"Improving LPCNET-Based Text-to-Speech with Linear Prediction-Structured Mixture Density Network","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W3015614880","doi":"https://doi.org/10.1109/icassp40776.2020.9053704","mag":"3015614880"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9053704","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053704","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055817509","display_name":"Min-Jae Hwang","orcid":"https://orcid.org/0000-0002-7376-009X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Min-Jae Hwang","raw_affiliation_strings":["Search Solutions Inc., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"Search Solutions Inc., Seongnam, Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104035145","display_name":"Eunwoo Song","orcid":"https://orcid.org/0000-0003-0642-7083"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Eunwoo Song","raw_affiliation_strings":["NAVER Corp., Seongnam, Korea"],"affiliations":[{"raw_affiliation_string":"NAVER Corp., Seongnam, Korea","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100669485","display_name":"Ryuichi Yamamoto","orcid":"https://orcid.org/0000-0003-0299-5470"},"institutions":[{"id":"https://openalex.org/I4210096607","display_name":"Line Corporation (Japan)","ror":"https://ror.org/00qg8pm87","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210096607","https://openalex.org/I60922564"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Ryuichi Yamamoto","raw_affiliation_strings":["LINE Corp., Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"LINE Corp., Tokyo, Japan","institution_ids":["https://openalex.org/I4210096607"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065394791","display_name":"Frank K. Soong","orcid":"https://orcid.org/0000-0002-9088-3577"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Frank Soong","raw_affiliation_strings":["Microsoft Research Asia, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5056128107","display_name":"Hong-Goo Kang","orcid":"https://orcid.org/0000-0002-6554-0783"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hong-Goo Kang","raw_affiliation_strings":["Yonsei Univ., Seoul, Korea"],"affiliations":[{"raw_affiliation_string":"Yonsei Univ., Seoul, Korea","institution_ids":["https://openalex.org/I193775966"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5055817509"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9279,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.79882521,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"7219","last_page":"7223"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/vocal-tract","display_name":"Vocal tract","score":0.7831465005874634},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.764297366142273},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6965605616569519},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6507566571235657},{"id":"https://openalex.org/keywords/linear-prediction","display_name":"Linear prediction","score":0.6005917191505432},{"id":"https://openalex.org/keywords/speech-production","display_name":"Speech production","score":0.5758036971092224},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.5607548952102661},{"id":"https://openalex.org/keywords/linear-predictive-coding","display_name":"Linear predictive coding","score":0.5233815908432007},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.4439278841018677},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.435529887676239},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.4324910044670105},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.41113442182540894},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4100157618522644},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3517954349517822},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.1547110676765442},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.112092524766922}],"concepts":[{"id":"https://openalex.org/C47401133","wikidata":"https://www.wikidata.org/wiki/Q748953","display_name":"Vocal tract","level":2,"score":0.7831465005874634},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.764297366142273},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6965605616569519},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6507566571235657},{"id":"https://openalex.org/C131109320","wikidata":"https://www.wikidata.org/wiki/Q581012","display_name":"Linear prediction","level":2,"score":0.6005917191505432},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.5758036971092224},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5607548952102661},{"id":"https://openalex.org/C59883199","wikidata":"https://www.wikidata.org/wiki/Q1826438","display_name":"Linear predictive coding","level":3,"score":0.5233815908432007},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.4439278841018677},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.435529887676239},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.4324910044670105},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.41113442182540894},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4100157618522644},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3517954349517822},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.1547110676765442},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.112092524766922},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp40776.2020.9053704","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053704","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6100000143051147,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1522301498","https://openalex.org/W1533861849","https://openalex.org/W1579853615","https://openalex.org/W2102003408","https://openalex.org/W2284050935","https://openalex.org/W2519091744","https://openalex.org/W2751205669","https://openalex.org/W2769810959","https://openalex.org/W2890983311","https://openalex.org/W2929299742","https://openalex.org/W2949382160","https://openalex.org/W2962699518","https://openalex.org/W2963091184","https://openalex.org/W2963300588","https://openalex.org/W2963403868","https://openalex.org/W2963685250","https://openalex.org/W2963782041","https://openalex.org/W2963975282","https://openalex.org/W2964121744","https://openalex.org/W2964243274","https://openalex.org/W2964307104","https://openalex.org/W2972574864","https://openalex.org/W2972597685","https://openalex.org/W2984862052","https://openalex.org/W3015338123","https://openalex.org/W4294619240","https://openalex.org/W4298580827","https://openalex.org/W4385245566","https://openalex.org/W6623517193","https://openalex.org/W6631190155","https://openalex.org/W6631943919","https://openalex.org/W6634817459","https://openalex.org/W6675380101","https://openalex.org/W6695676441","https://openalex.org/W6739901393","https://openalex.org/W6748409065","https://openalex.org/W6753855596","https://openalex.org/W6769767169"],"related_works":["https://openalex.org/W2363056088","https://openalex.org/W2363301696","https://openalex.org/W2808395304","https://openalex.org/W4312036005","https://openalex.org/W1921152853","https://openalex.org/W1994313308","https://openalex.org/W2509918616","https://openalex.org/W2383072803","https://openalex.org/W1570840316","https://openalex.org/W2899769381"],"abstract_inverted_index":{"In":[0],"this":[1,83],"paper,":[2],"we":[3,85,109],"propose":[4,110],"an":[5],"improved":[6],"LPCNet":[7,21,117],"vocoder":[8,22,94,118],"using":[9],"a":[10,34,40,144,151],"linear":[11],"prediction":[12],"(LP)-structured":[13],"mixture":[14],"density":[15,127],"network":[16],"(MDN).":[17],"The":[18,129],"recently":[19],"proposed":[20,135],"has":[23],"successfully":[24],"achieved":[25],"high-quality":[26],"and":[27,69,104],"lightweight":[28],"speech":[29,52,78,141],"synthesis":[30],"systems":[31],"by":[32,64,119,142],"combining":[33],"vocal":[35,42,58,102,105],"tract":[36,103],"LP":[37],"filter":[38],"with":[39,125],"WaveRNN-based":[41],"source":[43,59,106],"(i.e.,":[44],"excitation)":[45],"generator.":[46],"However,":[47],"the":[48,57,65,70,76,91,98,101,113,116,121,134],"quality":[49,139],"of":[50,148],"synthesized":[51],"is":[53,61,72],"often":[54],"unstable":[55],"because":[56],"component":[60],"insufficiently":[62],"represented":[63],"\u03bc-law":[66],"quantization":[67],"method,":[68],"model":[71],"trained":[73],"without":[74],"considering":[75],"entire":[77],"production":[79],"mechanism.":[80],"To":[81],"address":[82],"problem,":[84],"first":[86],"introduce":[87],"LP-MDN,":[88],"which":[89],"enables":[90],"autoregressive":[92],"neural":[93],"to":[95,111,115],"structurally":[96],"represent":[97],"interactions":[99],"between":[100],"components.":[107],"Then,":[108],"incorporate":[112],"LP-MDN":[114],"replacing":[120],"conventional":[122],"discretized":[123],"output":[124],"continuous":[126],"distribution.":[128],"experimental":[130],"results":[131],"verify":[132],"that":[133],"system":[136],"provides":[137],"high":[138],"synthetic":[140],"achieving":[143],"mean":[145],"opinion":[146],"score":[147],"4.41":[149],"within":[150],"text-to-speech":[152],"framework.":[153]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
