{"id":"https://openalex.org/W4399418625","doi":"https://doi.org/10.1145/3652583.3658029","title":"End-to-End Thai Text-to-Speech with Linguistic Unit","display_name":"End-to-End Thai Text-to-Speech with Linguistic Unit","publication_year":2024,"publication_date":"2024-05-30","ids":{"openalex":"https://openalex.org/W4399418625","doi":"https://doi.org/10.1145/3652583.3658029"},"language":"en","primary_location":{"id":"doi:10.1145/3652583.3658029","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652583.3658029","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658029","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658029","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5099043658","display_name":"Kontawat Wisetpaitoon","orcid":"https://orcid.org/0009-0007-7927-2779"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kontawat Wisetpaitoon","raw_affiliation_strings":["Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"],"raw_orcid":"https://orcid.org/0009-0007-7927-2779","affiliations":[{"raw_affiliation_string":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032996702","display_name":"Sattaya Singkul","orcid":"https://orcid.org/0000-0001-7335-7105"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sattaya Singkul","raw_affiliation_strings":["Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"],"raw_orcid":"https://orcid.org/0000-0001-7335-7105","affiliations":[{"raw_affiliation_string":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090674252","display_name":"Theerat Sakdejayont","orcid":"https://orcid.org/0009-0001-9640-2105"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Theerat Sakdejayont","raw_affiliation_strings":["Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"],"raw_orcid":"https://orcid.org/0009-0001-9640-2105","affiliations":[{"raw_affiliation_string":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5090165607","display_name":"Tawunrat Chalothorn","orcid":"https://orcid.org/0000-0003-4154-8745"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tawunrat Chalothorn","raw_affiliation_strings":["Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"],"raw_orcid":"https://orcid.org/0000-0003-4154-8745","affiliations":[{"raw_affiliation_string":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5099043658"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3311,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.62713947,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"951","last_page":"959"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.8646135330200195},{"id":"https://openalex.org/keywords/trimming","display_name":"Trimming","score":0.7461731433868408},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7350949048995972},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6513475179672241},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5362875461578369},{"id":"https://openalex.org/keywords/tone","display_name":"Tone (literature)","score":0.4837205708026886},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39811915159225464},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.35061049461364746},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.30090445280075073}],"concepts":[{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.8646135330200195},{"id":"https://openalex.org/C56951928","wikidata":"https://www.wikidata.org/wiki/Q3539213","display_name":"Trimming","level":2,"score":0.7461731433868408},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7350949048995972},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6513475179672241},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5362875461578369},{"id":"https://openalex.org/C2780583480","wikidata":"https://www.wikidata.org/wiki/Q1366327","display_name":"Tone (literature)","level":2,"score":0.4837205708026886},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39811915159225464},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35061049461364746},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.30090445280075073},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3652583.3658029","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652583.3658029","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658029","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3652583.3658029","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3652583.3658029","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3652583.3658029","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.4300000071525574,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399418625.pdf","grobid_xml":"https://content.openalex.org/works/W4399418625.grobid-xml"},"referenced_works_count":16,"referenced_works":["https://openalex.org/W1981457580","https://openalex.org/W2001463936","https://openalex.org/W2125923544","https://openalex.org/W2161906004","https://openalex.org/W2904833599","https://openalex.org/W2964243274","https://openalex.org/W2981087920","https://openalex.org/W2996052604","https://openalex.org/W3009725011","https://openalex.org/W3012423810","https://openalex.org/W3013020904","https://openalex.org/W3116256903","https://openalex.org/W3174090270","https://openalex.org/W3197186969","https://openalex.org/W4297536219","https://openalex.org/W6923381983"],"related_works":["https://openalex.org/W2467235537","https://openalex.org/W4243755427","https://openalex.org/W1493074871","https://openalex.org/W2222099502","https://openalex.org/W2128287377","https://openalex.org/W1979067309","https://openalex.org/W2375590729","https://openalex.org/W2385024427","https://openalex.org/W2978797270","https://openalex.org/W1972676838"],"abstract_inverted_index":{"In":[0],"this":[1],"study,":[2],"we":[3,43,99],"explore":[4],"the":[5,16,48,53,79,112,133,139,144,158],"influence":[6],"of":[7,135,146],"Thai":[8,18,25,102,140],"Linguistic":[9],"Units":[10],"(TH-LUs)":[11],"and":[12,32,52,64,92,127,149,156,169],"speech":[13,56,94,150],"trimming":[14,57,151],"on":[15],"state-of-the-art":[17],"Text-to-Speech":[19],"(TTS)":[20],"systems.We":[21],"propose":[22],"an":[23],"end-to-end":[24],"TTS":[26,153],"framework":[27],"that":[28,109,163],"emphasizes":[29],"phonemes,":[30],"syllables,":[31],"words,":[33],"essential":[34],"for":[35,138,160,165],"accurate":[36],"text":[37],"pronunciation.To":[38],"thoroughly":[39],"investigate":[40],"these":[41],"aspects,":[42],"designed":[44],"two":[45],"main":[46],"experiments:":[47],"TH-LU":[49,54,148],"factor":[50,58],"experiment":[51],"with":[55,111],"experiment.Our":[59],"assessment":[60],"targeted":[61],"speaker":[62,125],"tone":[63,126],"pronunciation":[65,97,128,170],"accuracy.VITS":[66],"model":[67,104,154],"demonstrated":[68],"a":[69,101],"standout":[70],"performer":[71],"in":[72,89,119,123,152],"tonal":[73,167],"accuracy,":[74,98],"which":[75],"is":[76],"evaluated":[77],"by":[78,142],"Speaker":[80],"Encoder":[81],"Cosine":[82],"Similarity":[83],"(SECS)":[84],"method,":[85],"across":[86],"different":[87],"TH-LUs":[88],"both":[90,124,166],"trim":[91],"non-trim":[93],"training":[95],"data.For":[96],"integrated":[100],"speech-to-text":[103],"to":[105],"evaluate.Our":[106],"results":[107],"indicate":[108],"VITS":[110],"word":[113],"linguistic":[114],"unit":[115],"outperforms":[116],"all":[117],"baselines":[118],"overall":[120],"performance,":[121],"excelling":[122],"accuracy.This":[129],"research":[130],"significantly":[131],"advances":[132],"field":[134],"TTS,":[136],"particularly":[137],"language,":[141],"highlighting":[143],"importance":[145],"diverse":[147],"development":[155],"underlining":[157],"need":[159],"evaluation":[161],"methods":[162],"account":[164],"accuracy":[168],"quality.":[171]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
