{"id":"https://openalex.org/W4372260365","doi":"https://doi.org/10.1109/icassp49357.2023.10096929","title":"CROSSSPEECH: Speaker-Independent Acoustic Representation for Cross-Lingual Speech Synthesis","display_name":"CROSSSPEECH: Speaker-Independent Acoustic Representation for Cross-Lingual Speech Synthesis","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372260365","doi":"https://doi.org/10.1109/icassp49357.2023.10096929"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096929","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096929","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100377802","display_name":"Ji\u2010Hoon Kim","orcid":"https://orcid.org/0000-0002-5212-1686"},"institutions":[{"id":"https://openalex.org/I4210086063","display_name":"Handok (South Korea)","ror":"https://ror.org/00exehz38","country_code":"KR","type":"company","lineage":["https://openalex.org/I4210086063"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Ji-Hoon Kim","raw_affiliation_strings":["42dot Inc.,Seoul,Republic of Korea","42dot Inc., Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"42dot Inc.,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I4210086063"]},{"raw_affiliation_string":"42dot Inc., Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069744293","display_name":"Hong-Sun Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210086063","display_name":"Handok (South Korea)","ror":"https://ror.org/00exehz38","country_code":"KR","type":"company","lineage":["https://openalex.org/I4210086063"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hong-Sun Yang","raw_affiliation_strings":["42dot Inc.,Seoul,Republic of Korea","42dot Inc., Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"42dot Inc.,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I4210086063"]},{"raw_affiliation_string":"42dot Inc., Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059619263","display_name":"Yoon-Cheol Ju","orcid":null},"institutions":[{"id":"https://openalex.org/I4210086063","display_name":"Handok (South Korea)","ror":"https://ror.org/00exehz38","country_code":"KR","type":"company","lineage":["https://openalex.org/I4210086063"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Yoon-Cheol Ju","raw_affiliation_strings":["42dot Inc.,Seoul,Republic of Korea","42dot Inc., Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"42dot Inc.,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I4210086063"]},{"raw_affiliation_string":"42dot Inc., Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635106","display_name":"Il-Hwan Kim","orcid":"https://orcid.org/0000-0003-3554-8845"},"institutions":[{"id":"https://openalex.org/I4210086063","display_name":"Handok (South Korea)","ror":"https://ror.org/00exehz38","country_code":"KR","type":"company","lineage":["https://openalex.org/I4210086063"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Il-Hwan Kim","raw_affiliation_strings":["42dot Inc.,Seoul,Republic of Korea","42dot Inc., Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"42dot Inc.,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I4210086063"]},{"raw_affiliation_string":"42dot Inc., Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053784244","display_name":"Byeong-Yeol Kim","orcid":"https://orcid.org/0000-0001-6019-5047"},"institutions":[{"id":"https://openalex.org/I4210086063","display_name":"Handok (South Korea)","ror":"https://ror.org/00exehz38","country_code":"KR","type":"company","lineage":["https://openalex.org/I4210086063"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Byeong-Yeol Kim","raw_affiliation_strings":["42dot Inc.,Seoul,Republic of Korea","42dot Inc., Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"42dot Inc.,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I4210086063"]},{"raw_affiliation_string":"42dot Inc., Seoul, Republic of Korea","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.3053,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.83797775,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7755832076072693},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.704204261302948},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.581695556640625},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5681090354919434},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5498785376548767},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.515719473361969},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.49928832054138184},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4871222972869873},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.48018231987953186},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4283861219882965},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.42376935482025146},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4233013391494751},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.42017924785614014},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3679054379463196},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1066858172416687},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.06971067190170288},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.06005430221557617}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7755832076072693},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.704204261302948},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.581695556640625},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5681090354919434},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5498785376548767},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.515719473361969},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.49928832054138184},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4871222972869873},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.48018231987953186},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4283861219882965},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.42376935482025146},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4233013391494751},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.42017924785614014},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3679054379463196},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1066858172416687},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.06971067190170288},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.06005430221557617},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C44870925","wikidata":"https://www.wikidata.org/wiki/Q37547","display_name":"Astrophysics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096929","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096929","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.4300000071525574}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2187089797","https://openalex.org/W2890964092","https://openalex.org/W2964002616","https://openalex.org/W2964243274","https://openalex.org/W2972473628","https://openalex.org/W3048217770","https://openalex.org/W3096514088","https://openalex.org/W3150572638","https://openalex.org/W3161436426","https://openalex.org/W3196001064","https://openalex.org/W3196468212","https://openalex.org/W3197186640","https://openalex.org/W4200300291","https://openalex.org/W4225320802","https://openalex.org/W4280542470","https://openalex.org/W4283640572","https://openalex.org/W4285189120","https://openalex.org/W4296068812","https://openalex.org/W4297841773","https://openalex.org/W4313011609","https://openalex.org/W4385245566","https://openalex.org/W6758853722","https://openalex.org/W6762287338","https://openalex.org/W6789380839"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1999004162","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2023466863","https://openalex.org/W2696990509"],"abstract_inverted_index":{"While":[0],"recent":[1],"text-to-speech":[2],"(TTS)":[3],"systems":[4],"have":[5],"made":[6],"remarkable":[7],"strides":[8],"toward":[9],"human-level":[10],"quality,":[11],"the":[12,29,44,57,66,71,82,95,98,122,141],"performance":[13],"of":[14,20,46,59,137],"cross-lingual":[15,34,47,132],"TTS":[16],"lags":[17],"behind":[18],"that":[19,104,126],"intra-lingual":[21],"TTS.":[22,35],"This":[23],"gap":[24],"is":[25,87],"mainly":[26],"rooted":[27],"from":[28],"speaker-language":[30],"entanglement":[31],"problem":[32],"in":[33,56,131,135],"In":[36],"this":[37],"paper,":[38],"we":[39,124],"propose":[40],"CrossSpeech":[41,64,113,127],"which":[42,86],"improves":[43],"quality":[45],"speech":[48,67,102],"by":[49],"effectively":[50],"disentangling":[51],"speaker":[52,92,106,117,138],"and":[53,75,118],"language":[54,119],"information":[55,111],"level":[58],"acoustic":[60,84],"feature":[61],"space.":[62],"Specifically,":[63],"decomposes":[65],"generation":[68],"pipeline":[69],"into":[70],"speaker-independent":[72,83],"generator":[73,77],"(SIG)":[74],"speaker-dependent":[76,101],"(SDG).":[78],"The":[79],"SIG":[80],"produces":[81],"representation":[85],"not":[88],"biased":[89],"to":[90,140],"specific":[91],"distributions.":[93],"On":[94],"other":[96],"hand,":[97],"SDG":[99],"models":[100],"variation":[103],"characterizes":[105],"attributes.":[107],"By":[108],"handling":[109],"each":[110],"separately,":[112],"can":[114],"obtain":[115],"disentangled":[116],"representations.":[120],"From":[121],"experiments,":[123],"verify":[125],"achieves":[128],"significant":[129],"improvements":[130],"TTS,":[133],"especially":[134],"terms":[136],"similarity":[139],"target":[142],"speaker.":[143]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
