{"id":"https://openalex.org/W4225276642","doi":"https://doi.org/10.1109/icassp43922.2022.9746282","title":"Multilingual Text-To-Speech Training Using Cross Language Voice Conversion And Self-Supervised Learning Of Speech Representations","display_name":"Multilingual Text-To-Speech Training Using Cross Language Voice Conversion And Self-Supervised Learning Of Speech Representations","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4225276642","doi":"https://doi.org/10.1109/icassp43922.2022.9746282"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9746282","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746282","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057239023","display_name":"Jilong Wu","orcid":"https://orcid.org/0009-0007-8000-347X"},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":true,"raw_author_name":"Jilong Wu","raw_affiliation_strings":["Facebook AI"],"affiliations":[{"raw_affiliation_string":"Facebook AI","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038017098","display_name":"Adam Polyak","orcid":"https://orcid.org/0000-0003-2563-2111"},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Adam Polyak","raw_affiliation_strings":["Facebook AI"],"affiliations":[{"raw_affiliation_string":"Facebook AI","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070563684","display_name":"Yaniv Taigman","orcid":null},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Yaniv Taigman","raw_affiliation_strings":["Facebook AI"],"affiliations":[{"raw_affiliation_string":"Facebook AI","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043162691","display_name":"Jason Fong","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jason Fong","raw_affiliation_strings":["The University of Edinburgh"],"affiliations":[{"raw_affiliation_string":"The University of Edinburgh","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043514543","display_name":"Prabhav Agrawal","orcid":null},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Prabhav Agrawal","raw_affiliation_strings":["Facebook AI"],"affiliations":[{"raw_affiliation_string":"Facebook AI","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102191169","display_name":"Qing He","orcid":null},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Qing He","raw_affiliation_strings":["Facebook AI"],"affiliations":[{"raw_affiliation_string":"Facebook AI","institution_ids":["https://openalex.org/I2252078561"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5057239023"],"corresponding_institution_ids":["https://openalex.org/I2252078561"],"apc_list":null,"apc_paid":null,"fwci":1.3504,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.82492997,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"8017","last_page":"8021"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7947706580162048},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.6077654361724854},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5969853401184082},{"id":"https://openalex.org/keywords/german","display_name":"German","score":0.5570636987686157},{"id":"https://openalex.org/keywords/polyglot","display_name":"Polyglot","score":0.5526509881019592},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5088096261024475},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.47954556345939636},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.42758506536483765},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.41107672452926636},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.3983202874660492},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.292915403842926}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7947706580162048},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.6077654361724854},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5969853401184082},{"id":"https://openalex.org/C154775046","wikidata":"https://www.wikidata.org/wiki/Q188","display_name":"German","level":2,"score":0.5570636987686157},{"id":"https://openalex.org/C2780239667","wikidata":"https://www.wikidata.org/wiki/Q2102850","display_name":"Polyglot","level":2,"score":0.5526509881019592},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5088096261024475},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.47954556345939636},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.42758506536483765},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.41107672452926636},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.3983202874660492},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.292915403842926},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9746282","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746282","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.8299999833106995}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2202109488","https://openalex.org/W2408526116","https://openalex.org/W2755348046","https://openalex.org/W2794235490","https://openalex.org/W2903739847","https://openalex.org/W2962788625","https://openalex.org/W2964167449","https://openalex.org/W2964243274","https://openalex.org/W2972473628","https://openalex.org/W2977798327","https://openalex.org/W3036601975","https://openalex.org/W3092028330","https://openalex.org/W3095948607","https://openalex.org/W3096323553","https://openalex.org/W3097032879","https://openalex.org/W3115632755","https://openalex.org/W3140429000","https://openalex.org/W3141523618","https://openalex.org/W3160248152","https://openalex.org/W3163296124","https://openalex.org/W3198429080","https://openalex.org/W4298580827","https://openalex.org/W6748409065","https://openalex.org/W6779919476","https://openalex.org/W6780218876","https://openalex.org/W6783867762","https://openalex.org/W6787860843"],"related_works":["https://openalex.org/W2482180524","https://openalex.org/W2748879498","https://openalex.org/W2772686614","https://openalex.org/W3164858600","https://openalex.org/W3109498233","https://openalex.org/W2031768607","https://openalex.org/W2482338111","https://openalex.org/W2755348046","https://openalex.org/W4200068392","https://openalex.org/W2611833102"],"abstract_inverted_index":{"State":[0],"of":[1,47,126,170],"the":[2,23,98,124,127,168],"art":[3],"text-to-speech":[4],"(TTS)":[5],"models":[6],"can":[7],"generate":[8],"high":[9],"fidelity":[10],"monolingual":[11,146],"speech,":[12],"but":[13],"it":[14],"is":[15,29,51,86],"still":[16],"challenging":[17],"to":[18,35,117,123],"synthesize":[19],"multilingual":[20,65,119,140],"speech":[21,82,121],"from":[22,80,133],"same":[24],"speaker.":[25,129],"One":[26,45],"major":[27],"hurdle":[28],"for":[30,143,173,180,185,190,196],"training":[31],"data.":[32],"It\u2019s":[33],"hard":[34],"find":[36],"speakers":[37],"who":[38],"have":[39],"native":[40,145],"proficiency":[41],"in":[42],"several":[43],"languages.":[44],"way":[46],"mitigating":[48],"this":[49,60,115],"issue":[50],"by":[52],"generating":[53],"polyglot":[54],"corpus":[55],"through":[56,68,91],"voice":[57,72,125],"conversion.":[58],"In":[59],"paper,":[61],"we":[62,102,137],"train":[63,138],"such":[64],"TTS":[66,141],"system":[67,142,159,172],"a":[69,81,105,139,144],"novel":[70],"cross-lingual":[71],"conversion":[73],"model":[74,84,116],"trained":[75],"with":[76,88,167],"speaker-invariant":[77],"features":[78],"extracted":[79],"representation":[83],"which":[85,149],"pre-trained":[87],"53":[89],"languages":[90],"self-supervised":[92],"learning":[93],"[1].":[94],"To":[95],"further":[96],"improve":[97],"speaker":[99,106,148],"identity":[100],"shift,":[101],"also":[103],"adopt":[104],"similarity":[107],"loss":[108],"term":[109],"during":[110],"training.":[111],"We":[112],"then":[113],"use":[114],"convert":[118],"multi-speaker":[120,171],"data":[122,132],"target":[128],"Through":[130],"augmenting":[131],"4":[134],"other":[135],"languages,":[136,175],"English":[147],"speaks":[150],"5":[151],"languages(English,":[152],"French,":[153],"German,":[154,186],"Italian":[155],"and":[156,192],"Spanish).":[157],"Our":[158],"achieves":[160],"improved":[161],"mean":[162],"opinion":[163],"score":[164],"(MOS)":[165],"compared":[166],"baseline":[169],"all":[174],"specifically:":[176],"3.74":[177],"vs":[178,183,188,194],"3.62":[179],"Spanish,":[181],"3.11":[182],"2.71":[184],"3.47":[187],"2.84":[189],"Italian,":[191],"2.72":[193],"2.41":[195],"French.":[197]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
