{"id":"https://openalex.org/W4372260518","doi":"https://doi.org/10.1109/icassp49357.2023.10095651","title":"Improving fast-slow Encoder based Transducer with Streaming Deliberation","display_name":"Improving fast-slow Encoder based Transducer with Streaming Deliberation","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372260518","doi":"https://doi.org/10.1109/icassp49357.2023.10095651"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095651","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095651","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100343450","display_name":"Ke Li","orcid":"https://orcid.org/0000-0001-7200-4244"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ke Li","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074237839","display_name":"Jay Mahadeokar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jay Mahadeokar","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103232491","display_name":"Jinxi Guo","orcid":"https://orcid.org/0000-0001-9563-7351"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinxi Guo","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103247973","display_name":"Yangyang Shi","orcid":"https://orcid.org/0000-0001-5297-4155"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yangyang Shi","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048538280","display_name":"Gil Keren","orcid":"https://orcid.org/0000-0002-5153-3494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gil Keren","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066166549","display_name":"Ozlem Kalinli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlem Kalinli","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041313589","display_name":"Michael L. Seltzer","orcid":"https://orcid.org/0000-0003-3474-2451"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Michael L. Seltzer","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085310066","display_name":"Manh Duc Le","orcid":"https://orcid.org/0000-0003-3012-6053"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duc Le","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5100343450"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9465,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.73632026,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/deliberation","display_name":"Deliberation","score":0.8955894708633423},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.869404137134552},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.723596453666687},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6694546937942505},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5624959468841553},{"id":"https://openalex.org/keywords/transducer","display_name":"Transducer","score":0.5468848943710327},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.49939918518066406},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4955679178237915},{"id":"https://openalex.org/keywords/low-latency","display_name":"Low latency (capital markets)","score":0.41500306129455566},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3648856282234192},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.27162492275238037},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.2276628017425537},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.13373705744743347},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.11812078952789307},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.0692891776561737}],"concepts":[{"id":"https://openalex.org/C2776946740","wikidata":"https://www.wikidata.org/wiki/Q358652","display_name":"Deliberation","level":3,"score":0.8955894708633423},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.869404137134552},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.723596453666687},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6694546937942505},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5624959468841553},{"id":"https://openalex.org/C56318395","wikidata":"https://www.wikidata.org/wiki/Q215928","display_name":"Transducer","level":2,"score":0.5468848943710327},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.49939918518066406},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4955679178237915},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.41500306129455566},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3648856282234192},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27162492275238037},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.2276628017425537},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.13373705744743347},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.11812078952789307},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0692891776561737},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095651","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095651","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4099999964237213,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1828163288","https://openalex.org/W2102113734","https://openalex.org/W2193413348","https://openalex.org/W2327501763","https://openalex.org/W2766219058","https://openalex.org/W2916997151","https://openalex.org/W2936774411","https://openalex.org/W2962760690","https://openalex.org/W2963414781","https://openalex.org/W2973122799","https://openalex.org/W3007905516","https://openalex.org/W3011339933","https://openalex.org/W3015671919","https://openalex.org/W3095311338","https://openalex.org/W3097777922","https://openalex.org/W3149509723","https://openalex.org/W3160766462","https://openalex.org/W3162665866","https://openalex.org/W3163203022","https://openalex.org/W4224917454","https://openalex.org/W4224919446","https://openalex.org/W4283745191","https://openalex.org/W4297841343","https://openalex.org/W6638749077","https://openalex.org/W6675365184","https://openalex.org/W6687566353","https://openalex.org/W6747398299"],"related_works":["https://openalex.org/W4384627096","https://openalex.org/W2753773138","https://openalex.org/W3204354834","https://openalex.org/W6292469","https://openalex.org/W2223796429","https://openalex.org/W4283822356","https://openalex.org/W1950940422","https://openalex.org/W2129146436","https://openalex.org/W2032507829","https://openalex.org/W2147282173"],"abstract_inverted_index":{"This":[0],"paper":[1],"introduces":[2],"a":[3,36,97,131],"fast-slow":[4,25,67,145,173],"encoder":[5,26,51,68,146,174],"based":[6,27,69,147,175],"transducer":[7,28,70,176],"with":[8,76,130,144,163,172,191],"streaming":[9,37,49,89],"deliberation":[10,38,42,78,83,95,150,169],"for":[11,66,111],"end-to-end":[12],"automatic":[13],"speech":[14],"recognition.":[15],"We":[16,59,104],"aim":[17],"to":[18,55,71,87,128],"improve":[19,93],"the":[20,24,41,48,61,77,82,94,167],"recognition":[21,57],"accuracy":[22],"of":[23],"while":[29],"keeping":[30],"its":[31],"latency":[32,142],"low":[33],"by":[34,156],"integrating":[35],"model.":[39,79],"Specifically,":[40],"model":[43,84,135,151,170],"leverages":[44],"partial":[45,113],"hypotheses":[46],"from":[47,126],"fast":[50],"and":[52,74,108,118,137,183],"implicitly":[53],"learns":[54],"correct":[56],"errors.":[58],"modify":[60],"parallel":[62],"beam":[63],"search":[64],"algorithm":[65],"be":[72],"efficient":[73],"compatible":[75],"In":[80],"addition,":[81],"is":[85,102],"designed":[86],"process":[88],"data.":[90,161],"To":[91],"further":[92],"performance,":[96],"simple":[98],"text":[99],"augmentation":[100],"approach":[101],"explored.":[103],"also":[105,152],"compare":[106],"LSTM":[107],"Conformer":[109],"models":[110],"encoding":[112],"hypotheses.":[114],"Experiments":[115],"on":[116,158,181,188],"Librispeech":[117,182],"in-house":[119,160,189],"data":[120,190],"show":[121],"relative":[122,178,185],"WER":[123],"reductions":[124],"(WERRs)":[125],"3%":[127],"5%":[129],"slight":[132],"increase":[133],"in":[134],"size":[136],"negligible":[138],"extra":[139],"token":[140],"emission":[141,193],"compared":[143],"transducer.":[148],"The":[149],"reduces":[153],"rare":[154],"WERs":[155],"3-7%":[157],"large-scale":[159],"Compared":[162],"vanilla":[164],"neural":[165],"transducers,":[166],"proposed":[168],"together":[171],"obtains":[177],"10-11%":[179],"WERRs":[180],"around":[184],"6%":[186],"WERR":[187],"smaller":[192],"delays.":[194]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2026-05-09T13:55:54.758798","created_date":"2025-10-10T00:00:00"}
