{"id":"https://openalex.org/W4226543485","doi":"https://doi.org/10.48550/arxiv.2204.02967","title":"Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation","display_name":"Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation","publication_year":2022,"publication_date":"2022-04-06","ids":{"openalex":"https://openalex.org/W4226543485","doi":"https://doi.org/10.48550/arxiv.2204.02967"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2204.02967","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.02967","pdf_url":"https://arxiv.org/pdf/2204.02967","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2204.02967","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059951425","display_name":"Sravya Popuri","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Popuri, Sravya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072019513","display_name":"Peng\u2010Jen Chen","orcid":"https://orcid.org/0000-0001-5400-905X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Peng-Jen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087491225","display_name":"Changhan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Changhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058915697","display_name":"Juan Pino","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pino, Juan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005191803","display_name":"Yossi Adi","orcid":"https://orcid.org/0000-0003-2237-3898"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Adi, Yossi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112542984","display_name":"Jiatao Gu","orcid":"https://orcid.org/0000-0003-3578-2711"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Jiatao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051950818","display_name":"Wei-Ning Hsu","orcid":"https://orcid.org/0000-0001-5546-5217"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsu, Wei-Ning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5084509956","display_name":"Ann Lee","orcid":"https://orcid.org/0000-0003-4300-9032"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Ann","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5059951425"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9840999841690063,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.839841365814209},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.7757186889648438},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.7120740413665771},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6436983346939087},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.5716745853424072},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.561347484588623},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5500142574310303},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5275903344154358},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5142233371734619},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.4382934868335724},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.43469586968421936}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.839841365814209},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.7757186889648438},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.7120740413665771},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6436983346939087},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.5716745853424072},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.561347484588623},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5500142574310303},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5275903344154358},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5142233371734619},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.4382934868335724},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.43469586968421936},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2204.02967","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.02967","pdf_url":"https://arxiv.org/pdf/2204.02967","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"doi:10.48550/arxiv.2204.02967","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2204.02967","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2204.02967","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.02967","pdf_url":"https://arxiv.org/pdf/2204.02967","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.699999988079071,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4226543485.pdf","grobid_xml":"https://content.openalex.org/works/W4226543485.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2338806053","https://openalex.org/W4385571610","https://openalex.org/W2972060578","https://openalex.org/W4285877427","https://openalex.org/W783305165","https://openalex.org/W2293738010","https://openalex.org/W2130553454","https://openalex.org/W3022007134","https://openalex.org/W4360995948","https://openalex.org/W4317548404"],"abstract_inverted_index":{"Direct":[0],"speech-to-speech":[1],"translation":[2,36,68,91,111],"(S2ST)":[3],"models":[4],"suffer":[5],"from":[6],"data":[7,22,52,54,137],"scarcity":[8],"issues":[9],"as":[10],"there":[11],"exists":[12],"little":[13],"parallel":[14],"S2ST":[15],"data,":[16],"compared":[17,120],"to":[18,56,93,143],"the":[19,94],"amount":[20],"of":[21,30,63],"available":[23,152],"for":[24,89],"conventional":[25],"cascaded":[26],"systems":[27],"that":[28,71,86,113,140],"consist":[29],"automatic":[31],"speech":[32,51,74,100],"recognition":[33],"(ASR),":[34],"machine":[35],"(MT),":[37],"and":[38,53,78,81,102,130],"text-to-speech":[39],"(TTS)":[40],"synthesis.":[41],"In":[42],"this":[43,58],"work,":[44],"we":[45],"explore":[46],"self-supervised":[47,114],"pre-training":[48,80,115],"with":[49,121,124,136],"unlabeled":[50],"augmentation":[55,138],"tackle":[57],"issue.":[59],"We":[60],"take":[61],"advantage":[62],"a":[64],"recently":[65],"proposed":[66],"speech-to-unit":[67],"(S2UT)":[69],"framework":[70],"encodes":[72],"target":[73],"into":[75],"discrete":[76,103],"representations,":[77],"transfer":[79],"efficient":[82],"partial":[83],"finetuning":[84],"techniques":[85,139],"work":[87],"well":[88],"speech-to-text":[90],"(S2T)":[92],"S2UT":[95],"domain":[96],"by":[97],"studying":[98],"both":[99],"encoder":[101],"unit":[104],"decoder":[105],"pre-training.":[106],"Our":[107],"experiments":[108],"on":[109],"Spanish-English":[110],"show":[112],"consistently":[116],"improves":[117],"model":[118],"performance":[119],"multitask":[122],"learning":[123],"an":[125],"average":[126],"6.6-12.1":[127],"BLEU":[128],"gain,":[129],"it":[131],"can":[132],"be":[133],"further":[134],"combined":[135],"apply":[141],"MT":[142],"create":[144],"weakly":[145],"supervised":[146],"training":[147],"data.":[148],"Audio":[149],"samples":[150],"are":[151],"at:":[153],"https://facebookresearch.github.io/speech_translation/enhanced_direct_s2st_units/index.html":[154],".":[155]},"counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
