{"id":"https://openalex.org/W4408352199","doi":"https://doi.org/10.1109/icassp49660.2025.10889530","title":"Transducer-Llama: Integrating LLMs into Streamable Transducer-based Speech Recognition","display_name":"Transducer-Llama: Integrating LLMs into Streamable Transducer-based Speech Recognition","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352199","doi":"https://doi.org/10.1109/icassp49660.2025.10889530"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10889530","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889530","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068367173","display_name":"Keqi Deng","orcid":"https://orcid.org/0000-0003-1490-963X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Keqi Deng","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103232491","display_name":"Jinxi Guo","orcid":"https://orcid.org/0000-0001-9563-7351"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinxi Guo","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103207430","display_name":"Yingyi Ma","orcid":"https://orcid.org/0000-0002-0205-0197"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yingyi Ma","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013781881","display_name":"Niko Moritz","orcid":"https://orcid.org/0000-0003-4352-8115"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niko Moritz","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002191410","display_name":"Philip C. Woodland","orcid":"https://orcid.org/0000-0001-9069-0225"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Philip C. Woodland","raw_affiliation_strings":["University of Cambridge,Department of Engineering,UK"],"affiliations":[{"raw_affiliation_string":"University of Cambridge,Department of Engineering,UK","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066166549","display_name":"Ozlem Kalinli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlem Kalinli","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113773386","display_name":"Mike Seltzer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mike Seltzer","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5068367173"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.5147,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88690229,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.986299991607666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.986299991607666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9373000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transducer","display_name":"Transducer","score":0.9012132883071899},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4760130047798157},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4369828701019287},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.3256382346153259},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.2790250778198242},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.2005092203617096},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.12524735927581787}],"concepts":[{"id":"https://openalex.org/C56318395","wikidata":"https://www.wikidata.org/wiki/Q215928","display_name":"Transducer","level":2,"score":0.9012132883071899},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4760130047798157},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4369828701019287},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.3256382346153259},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2790250778198242},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.2005092203617096},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.12524735927581787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10889530","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889530","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1495239264","https://openalex.org/W2127141656","https://openalex.org/W2131342762","https://openalex.org/W2939111082","https://openalex.org/W3015686596","https://openalex.org/W3094979069","https://openalex.org/W3095410713","https://openalex.org/W3097777922","https://openalex.org/W3115075512","https://openalex.org/W3149335959","https://openalex.org/W3162665866","https://openalex.org/W3202419788","https://openalex.org/W4221151577","https://openalex.org/W4287888698","https://openalex.org/W4297727296","https://openalex.org/W4319862418","https://openalex.org/W4375869444","https://openalex.org/W4385573587","https://openalex.org/W4389524262","https://openalex.org/W4392903749","https://openalex.org/W4392903956","https://openalex.org/W4393157525","https://openalex.org/W4400033261","https://openalex.org/W4400900236","https://openalex.org/W4402111955","https://openalex.org/W4402118935","https://openalex.org/W4402671596","https://openalex.org/W4406461487","https://openalex.org/W6778883912","https://openalex.org/W6810738896","https://openalex.org/W6854866820","https://openalex.org/W6861561103","https://openalex.org/W6867545204","https://openalex.org/W6869546922"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2383068986","https://openalex.org/W2090275562","https://openalex.org/W3152005513","https://openalex.org/W2020960496","https://openalex.org/W1266100159","https://openalex.org/W2088920587","https://openalex.org/W2046074924"],"abstract_inverted_index":{"While":[0],"large":[1,48],"language":[2,63],"models":[3],"(LLMs)":[4],"have":[5],"been":[6],"applied":[7],"to":[8,73,148],"automatic":[9],"speech":[10,77],"recognition":[11],"(ASR),":[12],"the":[13,17,47,86,96,139,150,153,157,162,171],"task":[14],"of":[15,50,152],"making":[16],"model":[18,28,88],"streamable":[19],"remains":[20],"a":[21,26,35,90,105,114,120,133,177,184,189],"challenge.":[22],"This":[23],"paper":[24,66,112],"proposes":[25,113],"novel":[27],"architecture,":[29],"Transducer-Llama,":[30],"that":[31,46,83,170],"integrates":[32],"LLMs":[33,51,75],"into":[34],"Factorized":[36],"Transducer":[37],"(FT)":[38],"model,":[39],"naturally":[40],"enabling":[41],"streaming":[42,173],"capabilities.":[43],"Furthermore,":[44],"given":[45],"vocabulary":[49,70],"can":[52],"cause":[53],"data":[54],"sparsity":[55],"issue":[56],"and":[57,128,164,188],"increased":[58],"training":[59,127],"costs":[60],"for":[61],"spoken":[62],"systems,":[64],"this":[65,111],"introduces":[67],"an":[68,193],"efficient":[69],"adaptation":[71],"technique":[72],"align":[74],"with":[76,89,132,156],"system":[78],"vocabularies.":[79],"The":[80],"results":[81],"show":[82,169],"directly":[84],"optimizing":[85],"FT":[87,186],"strong":[91,134,185],"pre-trained":[92,107],"LLM-based":[93],"predictor":[94,123,155],"using":[95,119],"RNN-T":[97,125,194],"loss":[98,126,145],"yields":[99],"some":[100],"but":[101],"limited":[102],"improvements":[103],"over":[104,183,192],"smaller":[106],"LM":[108,116,122,137],"predictor.":[109],"Therefore,":[110],"weak-to-strong":[115],"swap":[117],"strategy,":[118],"weak":[121],"during":[124],"then":[129],"replacing":[130],"it":[131],"LLM.":[135],"After":[136],"replacement,":[138],"minimum":[140],"word":[141],"error":[142],"rate":[143],"(MWER)":[144],"is":[146],"employed":[147],"finetune":[149],"integration":[151],"LLM":[154],"Transducer-Llama":[158,174],"model.":[159],"Experiments":[160],"on":[161],"LibriSpeech":[163,167],"large-scale":[165],"multi-lingual":[166],"corpora":[168],"proposed":[172],"approach":[175],"gave":[176],"17%":[178],"relative":[179],"WER":[180],"reduction":[181],"(WERR)":[182],"baseline":[187],"32%":[190],"WERR":[191],"baseline.":[195]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-27T14:29:43.386196","created_date":"2025-10-10T00:00:00"}
