{"id":"https://openalex.org/W4401607735","doi":"https://doi.org/10.1109/taslp.2024.3444470","title":"Textless Unit-to-Unit Training for Many-to-Many Multilingual Speech-to-Speech Translation","display_name":"Textless Unit-to-Unit Training for Many-to-Many Multilingual Speech-to-Speech Translation","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4401607735","doi":"https://doi.org/10.1109/taslp.2024.3444470"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3444470","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3444470","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100343629","display_name":"Minsu Kim","orcid":"https://orcid.org/0000-0002-6514-0018"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Minsu Kim","raw_affiliation_strings":["Integrated Vision and Language Laboratory, School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Integrated Vision and Language Laboratory, School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101725649","display_name":"Jeongsoo Choi","orcid":"https://orcid.org/0009-0005-6817-604X"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jeongsoo Choi","raw_affiliation_strings":["Integrated Vision and Language Laboratory, School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Integrated Vision and Language Laboratory, School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084688455","display_name":"Dahun Kim","orcid":"https://orcid.org/0000-0003-1776-6195"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dahun Kim","raw_affiliation_strings":["Google DeepMind, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"Google DeepMind, Mountain View, CA, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038798134","display_name":"Yong Man Ro","orcid":"https://orcid.org/0000-0001-5306-6853"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Yong Man Ro","raw_affiliation_strings":["Integrated Vision and Language Laboratory, School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Integrated Vision and Language Laboratory, School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100343629"],"corresponding_institution_ids":["https://openalex.org/I157485424"],"apc_list":null,"apc_paid":null,"fwci":4.4747,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.95157698,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"32","issue":null,"first_page":"3934","last_page":"3946"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/unit","display_name":"Unit (ring theory)","score":0.6986176371574402},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.6516960859298706},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5131576657295227},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5099548101425171},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.489317387342453},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.44554203748703003},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3752022087574005},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3419922888278961},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.26759955286979675},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.23178160190582275},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.0729915201663971},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.06553101539611816},{"id":"https://openalex.org/keywords/mathematics-education","display_name":"Mathematics education","score":0.046044230461120605}],"concepts":[{"id":"https://openalex.org/C122637931","wikidata":"https://www.wikidata.org/wiki/Q118084","display_name":"Unit (ring theory)","level":2,"score":0.6986176371574402},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.6516960859298706},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5131576657295227},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5099548101425171},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.489317387342453},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.44554203748703003},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3752022087574005},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3419922888278961},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.26759955286979675},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.23178160190582275},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0729915201663971},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.06553101539611816},{"id":"https://openalex.org/C145420912","wikidata":"https://www.wikidata.org/wiki/Q853077","display_name":"Mathematics education","level":1,"score":0.046044230461120605},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3444470","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3444470","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6700000166893005,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320322120","display_name":"National Research Foundation of Korea","ror":"https://ror.org/013aysd81"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":111,"referenced_works":["https://openalex.org/W1922655562","https://openalex.org/W1992475611","https://openalex.org/W1995562189","https://openalex.org/W2033436836","https://openalex.org/W2105482032","https://openalex.org/W2106440210","https://openalex.org/W2242221029","https://openalex.org/W2525778437","https://openalex.org/W2526425061","https://openalex.org/W2551572271","https://openalex.org/W2766219058","https://openalex.org/W2781918655","https://openalex.org/W2891205112","https://openalex.org/W2896457183","https://openalex.org/W2933138175","https://openalex.org/W2962780374","https://openalex.org/W2962784628","https://openalex.org/W2963216553","https://openalex.org/W2963250244","https://openalex.org/W2963609956","https://openalex.org/W2963979492","https://openalex.org/W2964045208","https://openalex.org/W2972473628","https://openalex.org/W2972495969","https://openalex.org/W2972802841","https://openalex.org/W3001434439","https://openalex.org/W3015698636","https://openalex.org/W3015826515","https://openalex.org/W3017474798","https://openalex.org/W3017535695","https://openalex.org/W3034999214","https://openalex.org/W3081416955","https://openalex.org/W3090474612","https://openalex.org/W3095012670","https://openalex.org/W3119308075","https://openalex.org/W3140429000","https://openalex.org/W3142316150","https://openalex.org/W3180374548","https://openalex.org/W3181257032","https://openalex.org/W3196509775","https://openalex.org/W3197324626","https://openalex.org/W3197771105","https://openalex.org/W3205193540","https://openalex.org/W3205644108","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3213029956","https://openalex.org/W3213322812","https://openalex.org/W3213544594","https://openalex.org/W4200300291","https://openalex.org/W4221153524","https://openalex.org/W4221155340","https://openalex.org/W4224319127","https://openalex.org/W4226033575","https://openalex.org/W4226399820","https://openalex.org/W4287854499","https://openalex.org/W4296070387","https://openalex.org/W4307323391","https://openalex.org/W4307680525","https://openalex.org/W4309041555","https://openalex.org/W4313679638","https://openalex.org/W4322718191","https://openalex.org/W4323651091","https://openalex.org/W4372260478","https://openalex.org/W4372349107","https://openalex.org/W4375868953","https://openalex.org/W4381786045","https://openalex.org/W4385245566","https://openalex.org/W4385570550","https://openalex.org/W4385572318","https://openalex.org/W4385573012","https://openalex.org/W4385574033","https://openalex.org/W4385764360","https://openalex.org/W4385822683","https://openalex.org/W4385822729","https://openalex.org/W4385823403","https://openalex.org/W4386071467","https://openalex.org/W4386133927","https://openalex.org/W4392909068","https://openalex.org/W4394671563","https://openalex.org/W4403635980","https://openalex.org/W6640090968","https://openalex.org/W6687566353","https://openalex.org/W6727690538","https://openalex.org/W6734815144","https://openalex.org/W6750200984","https://openalex.org/W6752888775","https://openalex.org/W6754420807","https://openalex.org/W6763832098","https://openalex.org/W6776472420","https://openalex.org/W6778083308","https://openalex.org/W6778883912","https://openalex.org/W6780218876","https://openalex.org/W6783867762","https://openalex.org/W6790220310","https://openalex.org/W6790356757","https://openalex.org/W6796464841","https://openalex.org/W6798575157","https://openalex.org/W6805710207","https://openalex.org/W6810168380","https://openalex.org/W6810259195","https://openalex.org/W6810701745","https://openalex.org/W6811129797","https://openalex.org/W6838789019","https://openalex.org/W6841035593","https://openalex.org/W6846288075","https://openalex.org/W6848735303","https://openalex.org/W6849622896","https://openalex.org/W6850334629","https://openalex.org/W6850625674","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W3216976533","https://openalex.org/W100620283","https://openalex.org/W4301342010","https://openalex.org/W2495260952","https://openalex.org/W123774389"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"a":[3,49,108,209],"textless":[4,210],"training":[5,293],"method":[6,262],"for":[7,225,231,284],"many-to-many":[8,109,271],"multilingual":[9,34,232,265],"speech-to-speech":[10],"translation":[11,112],"that":[12,39,214,276,288],"can":[13,61,71,160,180,197,219,279],"also":[14,230,280],"benefit":[15],"the":[16,41,55,64,68,82,89,119,124,131,136,141,147,151,156,158,162,194,215,257,260,270,277,303],"transfer":[17],"of":[18,44,67,93,164,259],"pre-trained":[19],"knowledge":[20,163],"to":[21,102,128,145,171,174,200,269],"text-based":[22],"systems,":[23],"text-to-speech":[24,27],"synthesis":[25],"and":[26,78,91,169,187,191,236],"translation.":[28],"To":[29],"this":[30],"end,":[31],"we":[32,60,100,255,274],"represent":[33],"speech":[35,37,45,51,56,77,98,149,178],"with":[36,75],"units":[38,57,179],"are":[40,167,289],"discretized":[42],"representations":[43],"features":[46],"derived":[47],"from":[48,184],"self-supervised":[50],"model.":[52],"By":[53,86,248],"treating":[54],"as":[58,97,294],"pseudo-text,":[59],"focus":[62],"on":[63,123,140,245],"linguistic":[65],"content":[66],"speech,":[69],"which":[70,296],"be":[72,181,220],"easily":[73,182,198],"associated":[74,183],"both":[76,88,185],"text":[79,188,246],"modalities":[80],"at":[81],"phonetic":[83],"level":[84],"information.":[85],"setting":[87],"inputs":[90],"outputs":[92],"our":[94],"learning":[95],"problem":[96],"units,":[99],"propose":[101],"train":[103],"an":[104],"encoder-decoder":[105],"model":[106,159,196,218],"in":[107,150,208,302],"spoken":[110,133],"language":[111,126,143,272,282,286],"setting,":[113],"namely":[114],"Unit-to-Unit":[115],"Translation":[116,227,238],"(UTUT).":[117],"Specifically,":[118],"encoder":[120],"is":[121,138,206],"conditioned":[122,139],"source":[125],"token":[127,144],"correctly":[129],"understand":[130],"input":[132],"language,":[134],"while":[135],"decoder":[137],"target":[142,152],"generate":[146],"translated":[148],"language.":[153],"Therefore,":[154],"during":[155,292],"training,":[157,273],"build":[161],"how":[165,170],"languages":[166],"comprehended":[168],"relate":[172],"them":[173],"different":[175],"languages.":[176],"Since":[177],"audio":[186],"by":[189],"quantization":[190],"phonemization":[192],"respectively,":[193],"trained":[195,207],"transferred":[199],"text-related":[201],"tasks,":[202],"even":[203],"if":[204],"it":[205],"manner.":[211],"We":[212],"demonstrate":[213],"proposed":[216,261],"UTUT":[217,278],"effectively":[221],"utilized":[222],"not":[223,290,298],"only":[224,241],"Speech-to-Speech":[226],"(S2ST)":[228],"but":[229],"Text-to-Speech":[233,237],"Synthesis":[234],"(T2S)":[235],"(T2ST),":[239],"requiring":[240],"minimal":[242],"fine-tuning":[243],"steps":[244],"inputs.":[247],"conducting":[249],"comprehensive":[250],"experiments":[251],"encompassing":[252],"various":[253],"languages,":[254],"validate":[256],"efficacy":[258],"across":[263],"diverse":[264],"tasks.":[266],"Moreover,":[267],"thanks":[268],"show":[275],"perform":[281],"translations":[283],"novel":[285],"pairs":[287],"present":[291],"pairs,":[295],"has":[297],"well":[299],"been":[300],"explored":[301],"previous":[304],"literature.":[305]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":10}],"updated_date":"2026-04-04T08:04:53.788161","created_date":"2025-10-10T00:00:00"}
