{"id":"https://openalex.org/W4408345835","doi":"https://doi.org/10.1109/icassp49660.2025.10890853","title":"Efficient Streaming LLM for Speech Recognition","display_name":"Efficient Streaming LLM for Speech Recognition","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408345835","doi":"https://doi.org/10.1109/icassp49660.2025.10890853"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890853","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890853","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113970008","display_name":"Junteng Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Junteng Jia","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048538280","display_name":"Gil Keren","orcid":"https://orcid.org/0000-0002-5153-3494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gil Keren","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026526428","display_name":"Wei Zhou","orcid":"https://orcid.org/0000-0003-3622-3970"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei Zhou","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045428440","display_name":"Egor Lakomkin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Egor Lakomkin","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100400244","display_name":"Xiaohui Zhang","orcid":"https://orcid.org/0000-0002-9075-0336"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaohui Zhang","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101579713","display_name":"Chunyang Wu","orcid":"https://orcid.org/0000-0002-0269-3555"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chunyang Wu","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072932051","display_name":"Frank Seide","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Frank Seide","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074237839","display_name":"Jay Mahadeokar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jay Mahadeokar","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066166549","display_name":"Ozlem Kalinli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlem Kalinli","raw_affiliation_strings":["Meta AI,USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5113970008"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.04015527,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9251999855041504,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9251999855041504,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13717","display_name":"Advanced Algorithms and Applications","score":0.9241999983787537,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.765789270401001},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7045996189117432}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.765789270401001},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7045996189117432}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890853","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890853","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2936774411","https://openalex.org/W2964110616","https://openalex.org/W3008525923","https://openalex.org/W3015190365","https://openalex.org/W3048407879","https://openalex.org/W3148654612","https://openalex.org/W4319862683","https://openalex.org/W4385823346","https://openalex.org/W4389524500","https://openalex.org/W4390041933","https://openalex.org/W4391021666","https://openalex.org/W4392903288","https://openalex.org/W4392903330","https://openalex.org/W4392903956","https://openalex.org/W4392904805","https://openalex.org/W4401024755","https://openalex.org/W4401042284","https://openalex.org/W4402111955","https://openalex.org/W4402118935","https://openalex.org/W6747158283","https://openalex.org/W6847363464","https://openalex.org/W6850477478","https://openalex.org/W6850503672","https://openalex.org/W6851950068","https://openalex.org/W6852909395","https://openalex.org/W6853998256","https://openalex.org/W6857690716","https://openalex.org/W6858161441","https://openalex.org/W6860329836","https://openalex.org/W6864586866","https://openalex.org/W6864737356"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Recent":[0],"works":[1],"have":[2],"shown":[3],"that":[4],"prompting":[5],"large":[6],"language":[7],"models":[8],"with":[9,120],"audio":[10,30,41,92],"encodings":[11],"can":[12],"unlock":[13],"speech":[14,71],"recognition":[15],"capabilities.":[16],"However,":[17],"existing":[18],"techniques":[19],"do":[20,35],"not":[21,33],"scale":[22],"efficiently,":[23],"especially":[24],"while":[25],"handling":[26],"long":[27,139],"form":[28,140],"streaming":[29,70],"inputs":[31],"\u2014":[32],"only":[34],"they":[36,47],"extrapolate":[37],"poorly":[38],"beyond":[39],"the":[40,54,87,104,145],"length":[42],"seen":[43],"during":[44],"training,":[45,103],"but":[46],"are":[48,94],"also":[49],"computationally":[50],"inefficient":[51],"due":[52],"to":[53],"quadratic":[55],"cost":[56],"of":[57],"attention.In":[58],"this":[59],"work,":[60],"we":[61],"introduce":[62],"SpeechLLM-XL,":[63],"a":[64,111],"linear":[65],"scaling":[66],"decoder-only":[67],"model":[68],"for":[69,83,90],"recognition.":[72],"We":[73],"process":[74],"audios":[75],"in":[76],"configurable":[77],"chunks":[78],"using":[79,110],"limited":[80],"attention":[81],"window":[82],"reduced":[84],"computation,":[85],"and":[86,132],"text":[88],"tokens":[89],"each":[91],"chunk":[93,123],"generated":[95],"auto-regressively":[96],"until":[97],"an":[98],"EOS":[99],"is":[100,106],"predicted.":[101],"During":[102],"transcript":[105],"segmented":[107],"into":[108],"chunks,":[109],"CTC":[112],"forced":[113],"alignment":[114],"estimated":[115],"from":[116],"encoder":[117],"output.":[118],"SpeechLLM-XL":[119],"1.28":[121],"seconds":[122],"size":[124],"achieves":[125],"2.7%/6.7%":[126],"WER":[127],"on":[128,138],"LibriSpeech":[129],"test":[130],"clean/other,":[131],"it":[133],"shows":[134],"no":[135],"quality":[136],"degradation":[137],"utterances":[141],"10x":[142],"longer":[143],"than":[144],"training":[146],"utterances.":[147]},"counts_by_year":[],"updated_date":"2025-12-21T23:12:01.093139","created_date":"2025-10-10T00:00:00"}
