{"id":"https://openalex.org/W4385764360","doi":"https://doi.org/10.24963/ijcai.2023/575","title":"Learning to Speak from Text: Zero-Shot Multilingual Text-to-Speech with Unsupervised Text Pretraining","display_name":"Learning to Speak from Text: Zero-Shot Multilingual Text-to-Speech with Unsupervised Text Pretraining","publication_year":2023,"publication_date":"2023-08-01","ids":{"openalex":"https://openalex.org/W4385764360","doi":"https://doi.org/10.24963/ijcai.2023/575"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2023/575","is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2023/575","pdf_url":"https://www.ijcai.org/proceedings/2023/0575.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.ijcai.org/proceedings/2023/0575.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025983709","display_name":"Takaaki Saeki","orcid":"https://orcid.org/0000-0001-6003-768X"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Takaaki Saeki","raw_affiliation_strings":["The University of Tokyo","The University of Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Tokyo","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010858961","display_name":"Soumi Maiti","orcid":"https://orcid.org/0000-0001-6940-0115"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Soumi Maiti","raw_affiliation_strings":["Carnegie Mellon University","Carnegie Mellon University, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]},{"raw_affiliation_string":"Carnegie Mellon University, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100660610","display_name":"Xinjian Li","orcid":"https://orcid.org/0000-0003-4585-159X"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xinjian Li","raw_affiliation_strings":["Carnegie Mellon University","Carnegie Mellon University, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]},{"raw_affiliation_string":"Carnegie Mellon University, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Carnegie Mellon University","Carnegie Mellon University, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]},{"raw_affiliation_string":"Carnegie Mellon University, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013050263","display_name":"Shinnosuke Takamichi","orcid":"https://orcid.org/0000-0003-0520-7847"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinnosuke Takamichi","raw_affiliation_strings":["The University of Tokyo","The University of Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Tokyo","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003814223","display_name":"Hiroshi Saruwatari","orcid":"https://orcid.org/0000-0003-0876-5617"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroshi Saruwatari","raw_affiliation_strings":["The University of Tokyo","The University of Tokyo, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Tokyo","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5025983709"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":2.3194,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.90648326,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"5179","last_page":"5187"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8411032557487488},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6907601952552795},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.657291829586029},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5075589418411255},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.5045324563980103},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4490080177783966},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.41944336891174316},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4095177948474884},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.22377076745033264}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8411032557487488},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6907601952552795},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.657291829586029},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5075589418411255},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.5045324563980103},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4490080177783966},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.41944336891174316},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4095177948474884},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.22377076745033264},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2023/575","is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2023/575","pdf_url":"https://www.ijcai.org/proceedings/2023/0575.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.24963/ijcai.2023/575","is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2023/575","pdf_url":"https://www.ijcai.org/proceedings/2023/0575.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2363020693","display_name":null,"funder_award_id":"21H05054","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G7404166632","display_name":"Bridges: From Communities and Data to Workflows and Insight","funder_award_id":"1445606","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8729765517","display_name":null,"funder_award_id":"ACI-1445606","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4385764360.pdf"},"referenced_works_count":43,"referenced_works":["https://openalex.org/W1665214252","https://openalex.org/W2025638820","https://openalex.org/W2093450784","https://openalex.org/W2106568252","https://openalex.org/W2187089797","https://openalex.org/W2494654097","https://openalex.org/W2726515241","https://openalex.org/W2746132399","https://openalex.org/W2767052532","https://openalex.org/W2890964092","https://openalex.org/W2899663614","https://openalex.org/W2903739847","https://openalex.org/W2952037945","https://openalex.org/W2952638691","https://openalex.org/W2952711665","https://openalex.org/W2962780374","https://openalex.org/W2964002616","https://openalex.org/W2964243274","https://openalex.org/W2970925270","https://openalex.org/W2973213036","https://openalex.org/W2973217961","https://openalex.org/W3048217770","https://openalex.org/W3092028330","https://openalex.org/W3096303254","https://openalex.org/W3104723404","https://openalex.org/W3119308075","https://openalex.org/W3163339651","https://openalex.org/W3167533889","https://openalex.org/W3169905056","https://openalex.org/W3177252310","https://openalex.org/W3181257032","https://openalex.org/W3197324626","https://openalex.org/W3206375275","https://openalex.org/W4225956675","https://openalex.org/W4296068816","https://openalex.org/W4296069143","https://openalex.org/W4297841354","https://openalex.org/W4300191749","https://openalex.org/W4311000453","https://openalex.org/W4372272479","https://openalex.org/W4375869005","https://openalex.org/W4385245566","https://openalex.org/W4394666973"],"related_works":["https://openalex.org/W3013650182","https://openalex.org/W2989283631","https://openalex.org/W4249605382","https://openalex.org/W4313491656","https://openalex.org/W3279617","https://openalex.org/W4402958497","https://openalex.org/W1991183963","https://openalex.org/W2053087750","https://openalex.org/W2146390824","https://openalex.org/W2250701745"],"abstract_inverted_index":{"While":[0],"neural":[1],"text-to-speech":[2],"(TTS)":[3],"has":[4],"achieved":[5],"human-like":[6],"natural":[7],"synthetic":[8],"speech,":[9],"multilingual":[10,36,80,92],"TTS":[11,37,54,67,138],"systems":[12,55],"are":[13,64],"limited":[14],"to":[15,19,69],"resource-rich":[16],"languages":[17,58,119],"due":[18],"the":[20,42,51,75,123,129],"need":[21],"for":[22,34,41,56,59,118,148],"paired":[23,102,124],"text":[24],"and":[25],"studio-quality":[26],"audio":[27],"data.":[28,94,131],"This":[29,114],"paper":[30],"proposes":[31],"a":[32,101,105,110,140],"method":[33],"zero-shot":[35,137],"using":[38],"text-only":[39,48,93,130],"data":[40,49,103,125],"target":[43],"language.":[44,151],"The":[45],"use":[46],"of":[47,53,71,79,144],"allows":[50,115],"development":[52],"low-resource":[57],"which":[60],"only":[61],"textual":[62],"resources":[63],"available,":[65],"making":[66],"accessible":[68],"thousands":[70],"languages.":[72],"Inspired":[73],"by":[74],"strong":[76],"cross-lingual":[77],"transferability":[78],"language":[81,88],"models,":[82],"our":[83],"framework":[84],"first":[85],"performs":[86],"masked":[87],"model":[89,99],"pretraining":[90],"with":[91,100,139],"Then":[95],"we":[96],"train":[97],"this":[98],"in":[104,122,128],"supervised":[106],"manner,":[107],"while":[108],"freezing":[109],"language-aware":[111],"embedding":[112],"layer.":[113],"inference":[116],"even":[117],"not":[120],"included":[121],"but":[126],"present":[127],"Evaluation":[132],"results":[133],"demonstrate":[134],"highly":[135],"intelligible":[136],"character":[141],"error":[142],"rate":[143],"less":[145],"than":[146],"12%":[147],"an":[149],"unseen":[150]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
