{"id":"https://openalex.org/W4404035088","doi":"https://doi.org/10.1109/lsp.2024.3491019","title":"Towards Maximum Likelihood Training for Transducer-Based Streaming Speech Recognition","display_name":"Towards Maximum Likelihood Training for Transducer-Based Streaming Speech Recognition","publication_year":2024,"publication_date":"2024-11-04","ids":{"openalex":"https://openalex.org/W4404035088","doi":"https://doi.org/10.1109/lsp.2024.3491019"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2024.3491019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2024.3491019","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2411.17537","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103052875","display_name":"Hyeonseung Lee","orcid":"https://orcid.org/0000-0001-6997-205X"},"institutions":[{"id":"https://openalex.org/I124633538","display_name":"University of Seoul","ror":"https://ror.org/05en5nh73","country_code":"KR","type":"education","lineage":["https://openalex.org/I124633538"]},{"id":"https://openalex.org/I4210089444","display_name":"GS Caltex (South Korea)","ror":"https://ror.org/00bvkj141","country_code":"KR","type":"company","lineage":["https://openalex.org/I4210089444"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Hyeonseung Lee","raw_affiliation_strings":["XL8 Inc., Seoul, Republic of Korea","XL8 Inc., Seoul, South Korea"],"raw_orcid":"https://orcid.org/0000-0001-6997-205X","affiliations":[{"raw_affiliation_string":"XL8 Inc., Seoul, Republic of Korea","institution_ids":["https://openalex.org/I124633538"]},{"raw_affiliation_string":"XL8 Inc., Seoul, South Korea","institution_ids":["https://openalex.org/I4210089444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016783920","display_name":"Ji Won Yoon","orcid":"https://orcid.org/0000-0001-8631-4489"},"institutions":[{"id":"https://openalex.org/I67900169","display_name":"Chung-Ang University","ror":"https://ror.org/01r024a98","country_code":"KR","type":"education","lineage":["https://openalex.org/I67900169"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Ji Won Yoon","raw_affiliation_strings":["Department of Artificial Intelligence, Chung-Ang University, Seoul, Republic of Korea","Department of Artificial Intelligence, Chung-Ang University, Seoul, South Korea"],"raw_orcid":"https://orcid.org/0000-0001-8631-4489","affiliations":[{"raw_affiliation_string":"Department of Artificial Intelligence, Chung-Ang University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I67900169"]},{"raw_affiliation_string":"Department of Artificial Intelligence, Chung-Ang University, Seoul, South Korea","institution_ids":["https://openalex.org/I67900169"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Sungsoo Kim","orcid":"https://orcid.org/0009-0006-5062-7209"},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]},{"id":"https://openalex.org/I4210164379","display_name":"Seoul Media Institute of Technology","ror":"https://ror.org/04ywg4h07","country_code":"KR","type":"education","lineage":["https://openalex.org/I4210164379"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Sungsoo Kim","raw_affiliation_strings":["Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea"],"raw_orcid":"https://orcid.org/0009-0006-5062-7209","affiliations":[{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]},{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051356511","display_name":"Nam Soo Kim","orcid":"https://orcid.org/0000-0002-0568-4902"},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]},{"id":"https://openalex.org/I4210164379","display_name":"Seoul Media Institute of Technology","ror":"https://ror.org/04ywg4h07","country_code":"KR","type":"education","lineage":["https://openalex.org/I4210164379"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Nam Soo Kim","raw_affiliation_strings":["Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea"],"raw_orcid":"https://orcid.org/0000-0002-0568-4902","affiliations":[{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]},{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5103052875"],"corresponding_institution_ids":["https://openalex.org/I124633538","https://openalex.org/I4210089444"],"apc_list":null,"apc_paid":null,"fwci":0.3311,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.67392548,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"32","issue":null,"first_page":"26","last_page":"30"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9919000267982483,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9919000267982483,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9803000092506409,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9280999898910522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6840179562568665},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6803657412528992},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6075958609580994},{"id":"https://openalex.org/keywords/transducer","display_name":"Transducer","score":0.5173436999320984},{"id":"https://openalex.org/keywords/maximum-likelihood","display_name":"Maximum likelihood","score":0.48034417629241943},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3968753218650818},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3916758596897125},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.23099178075790405},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.15494030714035034},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.13779601454734802}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6840179562568665},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6803657412528992},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6075958609580994},{"id":"https://openalex.org/C56318395","wikidata":"https://www.wikidata.org/wiki/Q215928","display_name":"Transducer","level":2,"score":0.5173436999320984},{"id":"https://openalex.org/C49781872","wikidata":"https://www.wikidata.org/wiki/Q1045555","display_name":"Maximum likelihood","level":2,"score":0.48034417629241943},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3968753218650818},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3916758596897125},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.23099178075790405},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.15494030714035034},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.13779601454734802},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/lsp.2024.3491019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2024.3491019","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2411.17537","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.17537","pdf_url":"https://arxiv.org/pdf/2411.17537","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2411.17537","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.17537","pdf_url":"https://arxiv.org/pdf/2411.17537","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.41999998688697815,"display_name":"Gender equality","id":"https://metadata.un.org/sdg/5"}],"awards":[{"id":"https://openalex.org/G3830453344","display_name":null,"funder_award_id":"RS2023-00235082","funder_id":"https://openalex.org/F4320332128","funder_display_name":"Commercializations Promotion Agency for R and D Outcomes"},{"id":"https://openalex.org/G4149916161","display_name":null,"funder_award_id":"2021-0-01341","funder_id":"https://openalex.org/F4320328359","funder_display_name":"Ministry of Science and ICT, South Korea"},{"id":"https://openalex.org/G4447557280","display_name":null,"funder_award_id":"2021-0-01341","funder_id":"https://openalex.org/F4320335489","funder_display_name":"Institute for Information and Communications Technology Promotion"},{"id":"https://openalex.org/G4860259677","display_name":null,"funder_award_id":"2021-0-01341","funder_id":"https://openalex.org/F4320321202","funder_display_name":"Chung-Ang University"}],"funders":[{"id":"https://openalex.org/F4320321202","display_name":"Chung-Ang University","ror":"https://ror.org/01r024a98"},{"id":"https://openalex.org/F4320328359","display_name":"Ministry of Science and ICT, South Korea","ror":"https://ror.org/01wpjm123"},{"id":"https://openalex.org/F4320332128","display_name":"Commercializations Promotion Agency for R and D Outcomes","ror":null},{"id":"https://openalex.org/F4320335489","display_name":"Institute for Information and Communications Technology Promotion","ror":"https://ror.org/01g0hqq23"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4404035088.pdf","grobid_xml":"https://content.openalex.org/works/W4404035088.grobid-xml"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2144499799","https://openalex.org/W2160815625","https://openalex.org/W2327501763","https://openalex.org/W2799473636","https://openalex.org/W2936774411","https://openalex.org/W2939297570","https://openalex.org/W2962784628","https://openalex.org/W2962826786","https://openalex.org/W2972943112","https://openalex.org/W3006752097","https://openalex.org/W3015470971","https://openalex.org/W3015686596","https://openalex.org/W3016167541","https://openalex.org/W3160766462","https://openalex.org/W3163203022","https://openalex.org/W6601894380","https://openalex.org/W6610566761","https://openalex.org/W6639317949","https://openalex.org/W6774835902","https://openalex.org/W6790121257","https://openalex.org/W6839217812","https://openalex.org/W6850218400","https://openalex.org/W6852909395","https://openalex.org/W6857062747"],"related_works":["https://openalex.org/W2012283803","https://openalex.org/W4384820447","https://openalex.org/W2072454424","https://openalex.org/W2117438306","https://openalex.org/W2185942010","https://openalex.org/W2260725127","https://openalex.org/W230091440","https://openalex.org/W2004297762","https://openalex.org/W1992056405","https://openalex.org/W1966826629"],"abstract_inverted_index":{"Transducer":[0],"neural":[1],"networks":[2],"have":[3],"emerged":[4],"as":[5,93],"the":[6,24,34,55,71,74,78,98,104,113,116],"mainstream":[7],"approach":[8,44],"for":[9],"streaming":[10,27,117],"automatic":[11],"speech":[12],"recognition":[13],"(ASR),":[14],"offering":[15],"state-of-the-art":[16],"performance":[17],"in":[18,54],"balancing":[19],"accuracy":[20,114],"and":[21,51,60,77],"latency.":[22],"In":[23],"conventional":[25],"framework,":[26],"transducer":[28],"models":[29],"are":[30],"trained":[31],"to":[32,46,96],"maximize":[33],"likelihood":[35,59,76],"function":[36],"based":[37],"on":[38,103],"non-streaming":[39],"recursion":[40],"rules.":[41],"However,":[42],"this":[43],"leads":[45],"a":[47,67,94],"mismatch":[48],"between":[49,73],"training":[50,111],"inference,":[52],"resulting":[53],"issue":[56],"of":[57,70,115],"deformed":[58,79],"consequently":[61],"suboptimal":[62],"ASR":[63],"accuracy.":[64],"We":[65,87],"introduce":[66],"mathematical":[68],"quantification":[69],"gap":[72],"actual":[75],"likelihood,":[80],"namely":[81],"forward":[82],"variable":[83],"causal":[84],"compensation":[85],"(FoCC).":[86],"also":[88],"present":[89],"its":[90],"estimator,":[91],"FoCCE,":[92],"solution":[95],"estimate":[97],"exact":[99],"likelihood.":[100],"Through":[101],"experiments":[102],"LibriSpeech":[105],"dataset,":[106],"we":[107],"show":[108],"that":[109],"FoCCE":[110],"improves":[112],"transducers.":[118]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2024-11-05T00:00:00"}
