{"id":"https://openalex.org/W7138920180","doi":"https://doi.org/10.1609/aaai.v40i41.40766","title":"EA-VAE: Learning to Reconstruct Dysarthric Speech via Variational Autoencoder with Encoding Alignment","display_name":"EA-VAE: Learning to Reconstruct Dysarthric Speech via Variational Autoencoder with Encoding Alignment","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138920180","doi":"https://doi.org/10.1609/aaai.v40i41.40766"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i41.40766","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i41.40766","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40766/44727","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40766/44727","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011869875","display_name":"D. W. Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Daipeng Zhang","raw_affiliation_strings":["School of New Media and Communication, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130024651","display_name":"Wenhuan Lu","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenhuan Lu","raw_affiliation_strings":["School of New Media and Communication, Tianjin University, Tianjin, China\nCollege of Intelligence and Computing, Tianjin University, Tianjin, China\nSchool of Intelligence Science and Engineering, Qinghai Minzu University, Xining, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China\nCollege of Intelligence and Computing, Tianjin University, Tianjin, China\nSchool of Intelligence Science and Engineering, Qinghai Minzu University, Xining, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060201191","display_name":"Xianghu Yue","orcid":"https://orcid.org/0000-0003-3527-6034"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianghu Yue","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114910380","display_name":"Hongcheng Zhang","orcid":"https://orcid.org/0009-0008-7234-8475"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongcheng Zhang","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129937409","display_name":"Jianguo Wei","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianguo Wei","raw_affiliation_strings":["School of New Media and Communication, Tianjin University, Tianjin, China\nCollege of Intelligence and Computing, Tianjin University, Tianjin, China\nSchool of Intelligence Science and Engineering, Qinghai Minzu University, Xining, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China\nCollege of Intelligence and Computing, Tianjin University, Tianjin, China\nSchool of Intelligence Science and Engineering, Qinghai Minzu University, Xining, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5011869875"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.47368421,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"41","first_page":"34656","last_page":"34664"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.9514999985694885,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.9514999985694885,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.009999999776482582,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12684","display_name":"Stuttering Research and Treatment","score":0.009200000204145908,"subfield":{"id":"https://openalex.org/subfields/3203","display_name":"Clinical Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.6319000124931335},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.6223999857902527},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6220999956130981},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.477400004863739},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.44780001044273376},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4458000063896179},{"id":"https://openalex.org/keywords/dysarthria","display_name":"Dysarthria","score":0.4242999851703644},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.41850000619888306}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7598999738693237},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6481000185012817},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.6319000124931335},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.6223999857902527},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6220999956130981},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.477400004863739},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.44780001044273376},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4458000063896179},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4424999952316284},{"id":"https://openalex.org/C2777639682","wikidata":"https://www.wikidata.org/wiki/Q225957","display_name":"Dysarthria","level":2,"score":0.4242999851703644},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.41850000619888306},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.40070000290870667},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.3788999915122986},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3675999939441681},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3555999994277954},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.34150001406669617},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3077999949455261},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C3018392663","wikidata":"https://www.wikidata.org/wiki/Q202064","display_name":"Speech sound","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2712000012397766},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.26109999418258667}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i41.40766","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i41.40766","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40766/44727","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i41.40766","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i41.40766","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40766/44727","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.4287608861923218,"id":"https://metadata.un.org/sdg/16"}],"awards":[{"id":"https://openalex.org/G2298336378","display_name":null,"funder_award_id":"2023YFB2603902","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138920180.pdf","grobid_xml":"https://content.openalex.org/works/W7138920180.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Dysarthric":[0],"speech":[1,18,44,49,53,61,100,118,161],"reconstruction":[2,58],"(DSR)":[3],"aims":[4],"to":[5,116],"enhance":[6,56],"the":[7,16,37,57,71,89,92,94,98,106,125,143,151,156],"intelligibility":[8],"of":[9,60,80,91,105,155],"dysarthric":[10,17,43,99,160],"speech.":[11,109],"Compared":[12],"with":[13,102,119,134],"normal":[14,41,108],"speech,":[15,29],"is":[19,113],"characterized":[20],"by":[21],"its":[22],"pathological":[23],"features,":[24,83],"including":[25],"discontinuous":[26],"pronunciation,":[27],"slow":[28],"hoarseness,":[30],"and":[31,42,86,142,153],"improper":[32],"pauses.":[33],"Significant":[34],"disparities":[35],"in":[36,47,159],"feature":[38,62],"space":[39],"between":[40],"may":[45],"result":[46],"suboptimal":[48],"reconstruction,":[50],"thereby":[51],"degrading":[52],"intelligibility.":[54,121],"To":[55],"ability":[59],"spaces,":[63],"this":[64],"paper":[65],"proposes":[66],"a":[67,103,135],"DSR":[68],"model":[69,95],"named":[70],"Encoding-Aligned":[72],"Variational":[73],"Autoencoder":[74],"(EA-VAE).":[75],"By":[76],"incorporating":[77],"alignment":[78],"modules":[79],"frame-level":[81],"embedding":[82],"prior":[84],"distributions,":[85],"duration":[87],"into":[88],"encoder":[90],"VAE,":[93],"explicitly":[96],"aligns":[97],"encoding":[101],"representation":[104],"parallel":[107],"A":[110],"shared":[111],"decoder":[112],"then":[114],"used":[115],"generate":[117],"improved":[120],"Experimental":[122],"results":[123],"on":[124],"UASpeech":[126],"benchmark":[127],"confirm":[128],"that":[129],"EA-VAE":[130],"achieves":[131],"state-of-the-art":[132],"performance,":[133],"31.7%":[136],"relative":[137],"word":[138],"error":[139],"rate":[140],"reduction":[141],"highest":[144],"subjective":[145],"MOS":[146],"score":[147],"(4.48),":[148],"thoroughly":[149],"validating":[150],"effectiveness":[152],"advancements":[154],"proposed":[157],"method":[158],"reconstruction.":[162]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-20T00:00:00"}
