{"id":"https://openalex.org/W3161354387","doi":"https://doi.org/10.1109/icassp39728.2021.9414955","title":"Improving RNN Transducer with Target Speaker Extraction and Neural Uncertainty Estimation","display_name":"Improving RNN Transducer with Target Speaker Extraction and Neural Uncertainty Estimation","publication_year":2021,"publication_date":"2021-05-13","ids":{"openalex":"https://openalex.org/W3161354387","doi":"https://doi.org/10.1109/icassp39728.2021.9414955","mag":"3161354387"},"language":"en","primary_location":{"id":"doi:10.1109/icassp39728.2021.9414955","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414955","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008336983","display_name":"Jiatong Shi","orcid":"https://orcid.org/0000-0002-9050-8304"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jiatong Shi","raw_affiliation_strings":["Johns Hopkins University, USA"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005304261","display_name":"Chunlei Zhang","orcid":"https://orcid.org/0000-0002-3851-2357"},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chunlei Zhang","raw_affiliation_strings":["Tencent AI Lab, Bellevue WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106404246","display_name":"Chao Weng","orcid":null},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chao Weng","raw_affiliation_strings":["Tencent AI Lab, Bellevue WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Johns Hopkins University, USA"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106407019","display_name":"Meng Yu","orcid":"https://orcid.org/0000-0002-0031-9156"},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Meng Yu","raw_affiliation_strings":["Tencent AI Lab, Bellevue WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034476404","display_name":"Dong Yu","orcid":"https://orcid.org/0000-0003-0520-6844"},"institutions":[{"id":"https://openalex.org/I4210108985","display_name":"Bellevue Hospital Center","ror":"https://ror.org/01ky34z31","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I1283621791","https://openalex.org/I4210086933","https://openalex.org/I4210108985"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dong Yu","raw_affiliation_strings":["Tencent AI Lab, Bellevue WA, USA"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Bellevue WA, USA","institution_ids":["https://openalex.org/I4210108985"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5008336983"],"corresponding_institution_ids":["https://openalex.org/I145311948"],"apc_list":null,"apc_paid":null,"fwci":0.9142,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.73360379,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"6908","last_page":"6912"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8070324063301086},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.790280818939209},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5930367708206177},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5655869841575623},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.45804283022880554},{"id":"https://openalex.org/keywords/noise-measurement","display_name":"Noise measurement","score":0.45042335987091064},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.44463565945625305},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.437059611082077},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4228581488132477},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4189332127571106},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.41446250677108765},{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.41064825654029846},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3471505641937256},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.14223900437355042},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.09386441111564636},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.0915830135345459}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8070324063301086},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.790280818939209},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5930367708206177},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5655869841575623},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.45804283022880554},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.45042335987091064},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.44463565945625305},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.437059611082077},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4228581488132477},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4189332127571106},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.41446250677108765},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.41064825654029846},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3471505641937256},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.14223900437355042},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.09386441111564636},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0915830135345459},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp39728.2021.9414955","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414955","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.699999988079071}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W1598508708","https://openalex.org/W1828163288","https://openalex.org/W2042141988","https://openalex.org/W2066086651","https://openalex.org/W2077297191","https://openalex.org/W2145475542","https://openalex.org/W2304609584","https://openalex.org/W2345067732","https://openalex.org/W2584063687","https://openalex.org/W2600628583","https://openalex.org/W2606436965","https://openalex.org/W2640112133","https://openalex.org/W2747238065","https://openalex.org/W2749510669","https://openalex.org/W2749659773","https://openalex.org/W2754060818","https://openalex.org/W2802973008","https://openalex.org/W2887080793","https://openalex.org/W2889048668","https://openalex.org/W2890244912","https://openalex.org/W2891833136","https://openalex.org/W2939497224","https://openalex.org/W2951130829","https://openalex.org/W2952218014","https://openalex.org/W2954695182","https://openalex.org/W2962824709","https://openalex.org/W2963414781","https://openalex.org/W2972451902","https://openalex.org/W2972818416","https://openalex.org/W2973062255","https://openalex.org/W3007328579","https://openalex.org/W3015219411","https://openalex.org/W3015306128","https://openalex.org/W3016131876","https://openalex.org/W3032969657","https://openalex.org/W3083872384","https://openalex.org/W3096090308","https://openalex.org/W3099330747","https://openalex.org/W6638749077","https://openalex.org/W6732766861","https://openalex.org/W6735168207","https://openalex.org/W6747398299","https://openalex.org/W6754473786","https://openalex.org/W6773419339"],"related_works":["https://openalex.org/W2418631473","https://openalex.org/W2485008119","https://openalex.org/W2355125052","https://openalex.org/W2810291168","https://openalex.org/W4247725880","https://openalex.org/W3175075966","https://openalex.org/W1580555281","https://openalex.org/W3147117728","https://openalex.org/W1494547797","https://openalex.org/W2137069055"],"abstract_inverted_index":{"Target-speaker":[0],"speech":[1,7,27,60,76,87],"recognition":[2],"aims":[3],"to":[4,66,80],"recognize":[5],"target-speaker":[6,26],"from":[8,73],"noisy":[9,128],"environments":[10],"with":[11,84,110],"background":[12,111],"noise":[13,70],"and":[14,29,47,59,71],"interfering":[15],"speakers.":[16],"This":[17],"work":[18],"presents":[19],"a":[20,41,81,85],"joint":[21],"framework":[22],"that":[23,45,93,117],"combines":[24],"time-domain":[25],"extraction":[28,77,88],"Recurrent":[30],"Neural":[31],"Network":[32],"Transducer":[33],"(RNN-T).":[34],"To":[35],"stabilize":[36],"the":[37,52,74,95,127,132,135],"joint-training,":[38],"we":[39],"propose":[40],"multi-stage":[42],"training":[43],"strategy":[44],"pre-trains":[46],"fine-tunes":[48],"each":[49],"module":[50,98],"in":[51,126,134],"system":[53],"before":[54],"joint-training.":[55],"Meanwhile,":[56],"speaker":[57],"identity":[58],"enhancement":[61],"uncertainty":[62,97],"measures":[63],"are":[64],"proposed":[65],"compensate":[67],"for":[68],"residual":[69],"artifacts":[72],"target":[75,86],"module.":[78],"Compared":[79],"recognizer":[82],"fine-tuned":[83],"model,":[89],"our":[90,118],"experiments":[91,115],"show":[92],"adding":[94],"neural":[96],"significantly":[99],"reduces":[100],"17%":[101],"relative":[102,123],"Character":[103],"Error":[104],"Rate":[105],"(CER)":[106],"on":[107],"multi-speaker":[108],"signals":[109],"noise.":[112],"The":[113],"multi-condition":[114],"indicate":[116],"method":[119],"can":[120],"achieve":[121],"9%":[122],"performance":[124,133],"gain":[125],"condition":[129],"while":[130],"maintaining":[131],"clean":[136],"condition.":[137]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
