{"id":"https://openalex.org/W4408345654","doi":"https://doi.org/10.1109/icassp49660.2025.10890391","title":"Delayed Fusion: Integrating Large Language Models into First-Pass Decoding in End-to-end Speech Recognition","display_name":"Delayed Fusion: Integrating Large Language Models into First-Pass Decoding in End-to-end Speech Recognition","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408345654","doi":"https://doi.org/10.1109/icassp49660.2025.10890391"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890391","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890391","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087554069","display_name":"Takaaki Hori","orcid":"https://orcid.org/0000-0003-4560-8039"},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Takaaki Hori","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031960713","display_name":"Martin Kocour","orcid":"https://orcid.org/0000-0003-4924-5124"},"institutions":[{"id":"https://openalex.org/I60587646","display_name":"Brno University of Technology","ror":"https://ror.org/03613d656","country_code":"CZ","type":"education","lineage":["https://openalex.org/I60587646"]}],"countries":["CZ"],"is_corresponding":false,"raw_author_name":"Martin Kocour","raw_affiliation_strings":["Brno University of Technology"],"affiliations":[{"raw_affiliation_string":"Brno University of Technology","institution_ids":["https://openalex.org/I60587646"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050153883","display_name":"Adnan Haider","orcid":"https://orcid.org/0000-0003-2240-2708"},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Adnan Haider","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030109648","display_name":"Erik McDermott","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Erik McDermott","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110376025","display_name":"Xiaodan Zhuang","orcid":null},"institutions":[{"id":"https://openalex.org/I60587646","display_name":"Brno University of Technology","ror":"https://ror.org/03613d656","country_code":"CZ","type":"education","lineage":["https://openalex.org/I60587646"]}],"countries":["CZ"],"is_corresponding":false,"raw_author_name":"Xiaodan Zhuang","raw_affiliation_strings":["Brno University of Technology"],"affiliations":[{"raw_affiliation_string":"Brno University of Technology","institution_ids":["https://openalex.org/I60587646"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5087554069"],"corresponding_institution_ids":["https://openalex.org/I4210107260"],"apc_list":null,"apc_paid":null,"fwci":9.6482,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.97369785,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9704999923706055,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9704999923706055,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.7207632660865784},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7043353319168091},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.664613664150238},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6294412612915039},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.49243029952049255},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3331071734428406},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.14268416166305542},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08847609162330627}],"concepts":[{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.7207632660865784},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7043353319168091},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.664613664150238},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6294412612915039},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.49243029952049255},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3331071734428406},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.14268416166305542},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08847609162330627},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890391","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890391","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W7198584","https://openalex.org/W1710082047","https://openalex.org/W2127141656","https://openalex.org/W2138889249","https://openalex.org/W2627092829","https://openalex.org/W2739883972","https://openalex.org/W2766219058","https://openalex.org/W2886180730","https://openalex.org/W2888779557","https://openalex.org/W2963174344","https://openalex.org/W2963250244","https://openalex.org/W2963362078","https://openalex.org/W3152221657","https://openalex.org/W3197478142","https://openalex.org/W3205201903","https://openalex.org/W4221160683","https://openalex.org/W4297841367","https://openalex.org/W4385245566","https://openalex.org/W4385822949","https://openalex.org/W4388017359","https://openalex.org/W4391021666","https://openalex.org/W4391021773","https://openalex.org/W4392903704","https://openalex.org/W4392931212","https://openalex.org/W4402112329","https://openalex.org/W6638749077","https://openalex.org/W6769613987","https://openalex.org/W6810081322","https://openalex.org/W6850625674","https://openalex.org/W6853998256","https://openalex.org/W6854866820","https://openalex.org/W6858023062","https://openalex.org/W6861581687"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W3179968364","https://openalex.org/W1999612375","https://openalex.org/W2938107654","https://openalex.org/W2161474341","https://openalex.org/W4302615923","https://openalex.org/W3203142394","https://openalex.org/W2351061015","https://openalex.org/W4220731478","https://openalex.org/W4404782863"],"abstract_inverted_index":{"This":[0,111],"paper":[1],"presents":[2],"an":[3],"efficient":[4],"decoding":[5,100,141,156],"approach":[6,25],"for":[7],"end-to-end":[8],"automatic":[9],"speech":[10],"recognition":[11],"(E2E-ASR)":[12],"with":[13,38,96],"large":[14],"language":[15,28],"models":[16,29],"(LLMs).":[17],"Although":[18],"shallow":[19,162],"fusion":[20,153,163],"is":[21,43,75],"the":[22,54,58,68,72,117,123,127,168],"most":[23],"common":[24],"to":[26,66,93,161],"incorporate":[27],"into":[30],"E2E-ASR":[31],"decoding,":[32],"we":[33,64],"face":[34],"two":[35],"practical":[36],"problems":[37],"LLMs.":[39],"(1)":[40],"LLM":[41,91,124,130,145],"inference":[42,131],"computationally":[44],"costly.":[45],"(2)":[46],"There":[47],"may":[48],"be":[49],"a":[50,97],"vocabulary":[51],"mismatch":[52],"between":[53],"ASR":[55,69,94,109,138,143,170],"model":[56,70],"and":[57,79,101,144,158,164,172,180],"LLM.":[59],"To":[60],"resolve":[61],"this":[62],"mismatch,":[63],"need":[65],"retrain":[67],"and/or":[71],"LLM,":[73],"which":[74,89],"at":[76],"best":[77],"time-consuming":[78],"in":[80,108],"many":[81],"cases":[82],"not":[83,115],"feasible.":[84],"We":[85,149],"propose":[86],"delayed":[87,152],"fusion,":[88],"applies":[90],"scores":[92],"hypotheses":[95,120,139],"delay":[98],"during":[99,140],"enables":[102],"easier":[103],"use":[104],"of":[105,119,129,137],"pre-trained":[106],"LLMs":[107],"tasks.":[110],"method":[112],"can":[113],"reduce":[114],"only":[116],"number":[118,128],"scored":[121],"by":[122],"but":[125],"also":[126,134],"calls.":[132],"It":[133],"allows":[135],"re-tokenizion":[136],"if":[142],"employ":[146],"different":[147],"tokenizations.":[148],"demonstrate":[150],"that":[151],"provides":[154],"improved":[155],"speed":[157],"accuracy":[159],"compared":[160],"N-best":[165],"rescoring":[166],"using":[167],"LibriHeavy":[169],"corpus":[171],"three":[173],"public":[174],"LLMs,":[175],"OpenLLaMA":[176],"3B":[177],"&":[178],"7B":[179],"Mistral":[181],"7B.":[182]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
