{"id":"https://openalex.org/W4372191700","doi":"https://doi.org/10.1109/icassp49357.2023.10095616","title":"Joint Pre-Training with Speech and Bilingual Text for Direct Speech to Speech Translation","display_name":"Joint Pre-Training with Speech and Bilingual Text for Direct Speech to Speech Translation","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372191700","doi":"https://doi.org/10.1109/icassp49357.2023.10095616"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095616","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095616","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056696580","display_name":"Kun Wei","orcid":"https://orcid.org/0000-0002-8954-3735"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Kun Wei","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xian,China","Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xian, China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xian,China","institution_ids":["https://openalex.org/I17145004"]},{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xian, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106557565","display_name":"Long Zhou","orcid":"https://orcid.org/0009-0006-1919-4943"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Long Zhou","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101577318","display_name":"Ziqiang Zhang","orcid":"https://orcid.org/0000-0003-0110-1543"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Ziqiang Zhang","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100430201","display_name":"Liping Chen","orcid":"https://orcid.org/0009-0003-6453-5645"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Liping Chen","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-0785-8882"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061724331","display_name":"Lei He","orcid":"https://orcid.org/0000-0003-3028-6305"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Lei He","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365053","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-1089-9748"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014662947","display_name":"Furu Wei","orcid":"https://orcid.org/0000-0002-7810-5852"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Furu Wei","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5056696580"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":1.7457,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.87271771,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8558311462402344},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.6709214448928833},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6300998330116272},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5882471203804016},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5666362047195435},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5411140322685242},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.5365232825279236},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.5303148627281189},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4627174735069275},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.4374582767486572},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4195733964443207},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.38778156042099}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8558311462402344},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.6709214448928833},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6300998330116272},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5882471203804016},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5666362047195435},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5411140322685242},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.5365232825279236},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.5303148627281189},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4627174735069275},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.4374582767486572},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4195733964443207},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.38778156042099},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095616","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095616","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7699999809265137,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W22168010","https://openalex.org/W1537859740","https://openalex.org/W2097203679","https://openalex.org/W2136545725","https://openalex.org/W2582956876","https://openalex.org/W2949328740","https://openalex.org/W2963532001","https://openalex.org/W2970550739","https://openalex.org/W2972495969","https://openalex.org/W3001434439","https://openalex.org/W3007068036","https://openalex.org/W3015698636","https://openalex.org/W3036601975","https://openalex.org/W3119308075","https://openalex.org/W3142316150","https://openalex.org/W3173767661","https://openalex.org/W3175871055","https://openalex.org/W3180374548","https://openalex.org/W3186843219","https://openalex.org/W3209059054","https://openalex.org/W4221153524","https://openalex.org/W4221155340","https://openalex.org/W4226444650","https://openalex.org/W4280601369","https://openalex.org/W4287854499","https://openalex.org/W4296070387","https://openalex.org/W4385245566","https://openalex.org/W4385572318","https://openalex.org/W4385573012","https://openalex.org/W4385893869","https://openalex.org/W6600880057","https://openalex.org/W6739901393","https://openalex.org/W6780218876","https://openalex.org/W6787407267","https://openalex.org/W6798080464","https://openalex.org/W6799081819","https://openalex.org/W6810259195","https://openalex.org/W6810419249","https://openalex.org/W6810701745","https://openalex.org/W6846738872"],"related_works":["https://openalex.org/W4385571610","https://openalex.org/W4200068392","https://openalex.org/W4362498905","https://openalex.org/W2338806053","https://openalex.org/W2772686614","https://openalex.org/W2293738010","https://openalex.org/W2036933852","https://openalex.org/W1489621819","https://openalex.org/W2572044271","https://openalex.org/W2152945827"],"abstract_inverted_index":{"Direct":[0],"speech-to-speech":[1,70],"translation":[2,71],"(S2ST)":[3],"is":[4,58,81],"an":[5,113],"attractive":[6],"research":[7],"topic":[8],"with":[9,61],"many":[10],"advantages":[11],"compared":[12,120],"to":[13,35,91,121],"cascaded":[14],"S2ST.":[15],"However,":[16],"direct":[17,69],"S2ST":[18],"suffers":[19],"from":[20,28,89],"the":[21,26,29,32,36,39,76,85,96,99],"data":[22,67],"scarcity":[23],"problem":[24],"because":[25],"corpora":[27],"speech":[30,37,63,87],"of":[31,38,83,98,115],"source":[33,90],"language":[34,41],"target":[40,92],"are":[42],"very":[43],"rare.":[44],"To":[45],"address":[46],"this":[47,52],"issue,":[48],"we":[49],"propose":[50],"in":[51],"paper":[53],"a":[54,127],"Speech2S":[55,80,101,111],"model,":[56],"which":[57],"jointly":[59],"pre-trained":[60],"unpaired":[62],"and":[64,104,125],"bilingual":[65],"text":[66,78],"for":[68],"tasks.":[72],"By":[73],"effectively":[74],"leveraging":[75],"paired":[77],"data,":[79],"capable":[82],"modeling":[84],"cross-lingual":[86],"conversion":[88],"language.":[93],"We":[94],"verify":[95],"performance":[97,132],"proposed":[100],"on":[102],"Europarl-ST":[103],"VoxPopuli":[105],"datasets.":[106],"Experimental":[107],"results":[108],"demonstrate":[109],"that":[110],"gets":[112],"improvement":[114],"about":[116],"5":[117],"BLEU":[118],"scores":[119],"encoder-only":[122],"pre-training":[123],"models,":[124],"achieves":[126],"competitive":[128],"or":[129],"even":[130],"better":[131],"than":[133],"existing":[134],"state-of-the-art":[135],"models":[136],"<sup":[137],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[138],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[139],".":[140]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
