{"id":"https://openalex.org/W7148362168","doi":"https://doi.org/10.1109/asru65441.2025.11434639","title":"SLM-S2ST: A multimodal language model for direct speech-to-speech translation","display_name":"SLM-S2ST: A multimodal language model for direct speech-to-speech translation","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148362168","doi":"https://doi.org/10.1109/asru65441.2025.11434639"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434639","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434639","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yuxuan Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuxuan Hu","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Haibin Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haibin Wu","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ruchao Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruchao Fan","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaofei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaofei Wang","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Heng Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heng Lu","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yao Qian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao Qian","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Jinyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87608048,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5648999810218811,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5648999810218811,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.2037999927997589,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.031099999323487282,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.682200014591217},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6164000034332275},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.428600013256073},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.41850000619888306},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.41110000014305115},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4041000008583069},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.40310001373291016},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.3968000113964081}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8112999796867371},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.682200014591217},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6164000034332275},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5957000255584717},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5892000198364258},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.51910001039505},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.428600013256073},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.41850000619888306},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.41110000014305115},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4041000008583069},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.40310001373291016},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.3968000113964081},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.36000001430511475},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.32519999146461487},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.3246000111103058},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.32420000433921814},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3138999938964844},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.2946999967098236},{"id":"https://openalex.org/C622187","wikidata":"https://www.wikidata.org/wiki/Q3500773","display_name":"BLEU","level":3,"score":0.27950000762939453},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.27000001072883606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434639","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434639","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.48972928524017334}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W3097777922","https://openalex.org/W3180374548","https://openalex.org/W3196509775","https://openalex.org/W3198217962","https://openalex.org/W4319862635","https://openalex.org/W4385570550","https://openalex.org/W4402671605","https://openalex.org/W7133231661"],"related_works":[],"abstract_inverted_index":{"Speech-aware":[0],"language":[1,10],"models":[2,95],"(LMs)":[3],"have":[4],"demonstrated":[5],"capabilities":[6],"in":[7],"understanding":[8],"spoken":[9],"while":[11],"generating":[12,53],"text-based":[13],"responses.":[14],"However,":[15],"enabling":[16],"them":[17],"to":[18,69],"produce":[19],"speech":[20,55],"output":[21],"efficiently":[22],"and":[23,109],"effectively":[24],"remains":[25],"a":[26,34,66,74],"challenge.":[27],"In":[28],"this":[29],"paper,":[30],"we":[31,103],"present":[32],"SLM-S2ST,":[33],"multimodal":[35],"LM":[36],"for":[37,77],"direct":[38],"speech-to-speech":[39],"translation":[40],"(S2ST),":[41],"built":[42],"on":[43,83,97],"the":[44,84,98,106,110,118],"opensource":[45],"Phi4-MM":[46],"model.":[47,121],"SLM-S2ST":[48,113],"extends":[49],"its":[50],"predecessor":[51],"by":[52,73],"translated":[54],"using":[56],"an":[57],"audio":[58,63],"transformer":[59],"head":[60],"that":[61],"predicts":[62],"tokens":[64],"with":[65,117],"delay":[67],"relative":[68],"text":[70],"tokens,":[71],"followed":[72],"streaming":[75],"vocoder":[76],"waveform":[78],"synthesis.":[79],"Our":[80],"experimental":[81],"results":[82],"CVSS-C":[85],"dataset":[86],"demonstrate":[87],"SLMS2ST\u2019s":[88],"superior":[89],"performance,":[90],"significantly":[91],"surpassing":[92],"existing":[93],"baseline":[94],"trained":[96],"same":[99],"dataset.":[100],"Furthermore,":[101],"when":[102],"scale":[104],"up":[105],"training":[107],"data":[108],"model":[111],"size,":[112],"reaches":[114],"on-par":[115],"performance":[116],"current":[119],"SOTA":[120]},"counts_by_year":[],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2026-02-14T00:00:00"}
