{"id":"https://openalex.org/W3199016780","doi":"https://doi.org/10.1109/ijcnn52387.2021.9533348","title":"End-to-end speech recognition with Alignment RNN-Transducer","display_name":"End-to-end speech recognition with Alignment RNN-Transducer","publication_year":2021,"publication_date":"2021-07-18","ids":{"openalex":"https://openalex.org/W3199016780","doi":"https://doi.org/10.1109/ijcnn52387.2021.9533348","mag":"3199016780"},"language":"en","primary_location":{"id":"doi:10.1109/ijcnn52387.2021.9533348","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn52387.2021.9533348","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074244244","display_name":"Yingli Tian","orcid":"https://orcid.org/0000-0003-4458-360X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ying Tian","raw_affiliation_strings":["Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101654599","display_name":"Zerui Li","orcid":"https://orcid.org/0000-0002-0053-4227"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zerui Li","raw_affiliation_strings":["School of Electronic Engineering Beijing University of Posts and Telecommunications,Beijing,China","School of Electronic Engineering Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic Engineering Beijing University of Posts and Telecommunications,Beijing,China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Electronic Engineering Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100343919","display_name":"Min Liu","orcid":"https://orcid.org/0000-0002-8902-5460"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Min Liu","raw_affiliation_strings":["Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017058806","display_name":"Kazushige Ouchi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kazushige Ouchi","raw_affiliation_strings":["Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108745939","display_name":"Yan Long","orcid":"https://orcid.org/0009-0006-4579-5857"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long Yan","raw_affiliation_strings":["Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Research &#x0026; Development Center Toshiba(China) Co., Ltd,Beijing,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101830445","display_name":"Dan Zhao","orcid":"https://orcid.org/0000-0002-5946-3961"},"institutions":[{"id":"https://openalex.org/I96852419","display_name":"Capital Normal University","ror":"https://ror.org/005edt527","country_code":"CN","type":"education","lineage":["https://openalex.org/I96852419"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dan Zhao","raw_affiliation_strings":["College of Information Engineering Capital Normal University,Beijing,China","College of Information Engineering Capital Normal University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"College of Information Engineering Capital Normal University,Beijing,China","institution_ids":["https://openalex.org/I96852419"]},{"raw_affiliation_string":"College of Information Engineering Capital Normal University, Beijing, China","institution_ids":["https://openalex.org/I96852419"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5074244244"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.136,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54963363,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"32","issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.9209392070770264},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.822765588760376},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7142737507820129},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7012763023376465},{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.6549244523048401},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6407155394554138},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5986593961715698},{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.5698166489601135},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4329460859298706},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.33052289485931396},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.30485981702804565}],"concepts":[{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.9209392070770264},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.822765588760376},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7142737507820129},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7012763023376465},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.6549244523048401},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6407155394554138},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5986593961715698},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.5698166489601135},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4329460859298706},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.33052289485931396},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.30485981702804565},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn52387.2021.9533348","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn52387.2021.9533348","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1828163288","https://openalex.org/W1979001355","https://openalex.org/W2110052520","https://openalex.org/W2125838338","https://openalex.org/W2127141656","https://openalex.org/W2293858598","https://openalex.org/W2515439472","https://openalex.org/W2750499125","https://openalex.org/W2761434131","https://openalex.org/W2911544293","https://openalex.org/W2949975180","https://openalex.org/W2963242190","https://openalex.org/W2963882470","https://openalex.org/W2970971581","https://openalex.org/W2972470300","https://openalex.org/W2973122799","https://openalex.org/W4239447739","https://openalex.org/W4253573210"],"related_works":["https://openalex.org/W2978673814","https://openalex.org/W2782005958","https://openalex.org/W3199016780","https://openalex.org/W2263232528","https://openalex.org/W2996122240","https://openalex.org/W2888131732","https://openalex.org/W2407648438","https://openalex.org/W3112480982","https://openalex.org/W2944322787","https://openalex.org/W2944384275"],"abstract_inverted_index":{"The":[0],"Recurrent":[1],"Neural":[2],"Network":[3],"Transducer":[4,62],"(RNN-T)":[5],"extends":[6],"Connectionist":[7],"Temporal":[8],"Classification":[9],"(CTC)":[10],"by":[11,33,139],"jointly":[12],"modeling":[13],"both":[14],"input-output":[15],"and":[16,19,127],"output-output":[17],"dependencies,":[18],"it":[20,128],"has":[21],"been":[22],"successfully":[23],"applied":[24],"in":[25,37,70,114],"end-to-end":[26],"speech":[27,124],"recognition.":[28],"RNN-T":[29,141],"adds":[30,93],"language":[31],"information":[32,78,81,95],"expanding":[34],"its":[35],"dimensions":[36,90],"this":[38],"way,":[39],"but":[40,91],"the":[41,50,59,67,74,88,97,105,111,115,119,135,140],"model":[42,51],"training":[43,89],"is":[44],"difficult.":[45],"And":[46,99],"some":[47],"paths":[48],"of":[49],"are":[52],"unreasonable":[53],"when":[54],"decoding.":[55],"Therefore,":[56],"we":[57,100],"present":[58],"Alignment":[60],"RNN":[61],"(ART)":[63],"algorithm,":[64],"which":[65],"uses":[66],"forward-backward":[68],"algorithm":[69,120],"CTC":[71,103],"to":[72,96,108],"find":[73],"best":[75],"path":[76,113],"alignment":[77],"as":[79,104],"context-related":[80],"for":[82],"training.":[83],"We":[84,117],"not":[85],"only":[86],"reduce":[87],"also":[92],"context":[94],"model.":[98,142],"still":[101],"use":[102],"loss":[106],"function":[107],"avoid":[109],"decoding":[110],"impossible":[112],"RNN-T.":[116],"verify":[118],"on":[121],"a":[122,130],"Mandarin":[123],"corpus":[125],"AIShell-1,":[126],"achieves":[129],"13.65%":[131],"CER,":[132],"compared":[133],"with":[134],"16.79%":[136],"CER":[137],"given":[138]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
