{"id":"https://openalex.org/W4372259971","doi":"https://doi.org/10.1109/icassp49357.2023.10095115","title":"Conformer-Based Target-Speaker Automatic Speech Recognition For Single-Channel Audio","display_name":"Conformer-Based Target-Speaker Automatic Speech Recognition For Single-Channel Audio","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372259971","doi":"https://doi.org/10.1109/icassp49357.2023.10095115"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095115","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095115","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2308.05218","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100354733","display_name":"Yang Zhang","orcid":"https://orcid.org/0000-0002-8540-1254"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yang Zhang","raw_affiliation_strings":["NVIDIA,USA","NVIDIA, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069691915","display_name":"Krishna C. Puvvada","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Krishna C. Puvvada","raw_affiliation_strings":["NVIDIA,USA","NVIDIA, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026088310","display_name":"Vitaly Lavrukhin","orcid":"https://orcid.org/0009-0006-7866-8301"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vitaly Lavrukhin","raw_affiliation_strings":["NVIDIA,USA","NVIDIA, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032957280","display_name":"Boris Ginsburg","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Boris Ginsburg","raw_affiliation_strings":["NVIDIA,USA","NVIDIA, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, USA","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100354733"],"corresponding_institution_ids":["https://openalex.org/I4210127875"],"apc_list":null,"apc_paid":null,"fwci":2.637,"has_fulltext":true,"cited_by_count":15,"citation_normalized_percentile":{"value":0.91626296,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8190834522247314},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8090715408325195},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.8050515055656433},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.4800049662590027},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.45908230543136597},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4422857165336609},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43382498621940613},{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.4281691908836365},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.4188902974128723},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3411054313182831},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.16282868385314941}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8190834522247314},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8090715408325195},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.8050515055656433},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.4800049662590027},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.45908230543136597},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4422857165336609},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43382498621940613},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.4281691908836365},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.4188902974128723},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3411054313182831},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.16282868385314941},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095115","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095115","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2308.05218","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.05218","pdf_url":"https://arxiv.org/pdf/2308.05218","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2308.05218","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.05218","pdf_url":"https://arxiv.org/pdf/2308.05218","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4372259971.pdf"},"referenced_works_count":23,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2221409856","https://openalex.org/W2407080277","https://openalex.org/W2734774145","https://openalex.org/W2936774411","https://openalex.org/W2951130829","https://openalex.org/W2952218014","https://openalex.org/W2963574857","https://openalex.org/W2964058413","https://openalex.org/W3013139777","https://openalex.org/W3095173472","https://openalex.org/W3097643313","https://openalex.org/W3097777922","https://openalex.org/W3099330747","https://openalex.org/W3162847598","https://openalex.org/W3163652268","https://openalex.org/W3196194966","https://openalex.org/W3198522318","https://openalex.org/W3205878676","https://openalex.org/W4226491018","https://openalex.org/W4283320926","https://openalex.org/W4297841877"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2162158162","https://openalex.org/W4247736853","https://openalex.org/W1493012537","https://openalex.org/W1999004162","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2938358845","https://openalex.org/W1556857061"],"abstract_inverted_index":{"We":[0,77],"propose":[1],"CONF-TSASR,":[2],"a":[3,20,26,42,61],"non-autoregressive":[4],"end-to-end":[5],"time-frequency":[6],"domain":[7],"architecture":[8],"for":[9,91,108],"single-channel":[10],"target-speaker":[11,80],"automatic":[12],"speech":[13,46],"recognition":[14],"(TS-ASR).":[15],"The":[16,110],"model":[17,69,112],"consists":[18],"of":[19],"TitaNet":[21],"based":[22,28],"speaker":[23],"embedding":[24],"module,":[25],"Conformer":[27],"masking":[29],"as":[30,32],"well":[31],"ASR":[33],"modules.":[34],"These":[35],"modules":[36],"are":[37],"jointly":[38],"optimized":[39],"to":[40,66],"transcribe":[41],"target-speaker,":[43],"while":[44],"ignoring":[45],"from":[47,75],"other":[48],"speakers.":[49],"For":[50],"training":[51],"we":[52,89],"use":[53],"Connectionist":[54],"Temporal":[55],"Classification":[56],"(CTC)":[57],"loss":[58,65],"and":[59,101],"introduce":[60],"scale-invariant":[62],"spectrogram":[63,74],"reconstruction":[64],"encourage":[67],"the":[68,72,92],"better":[70],"separate":[71],"target-speaker\u2019s":[73],"mixture.":[76],"obtain":[78],"state-of-the-art":[79],"word":[81],"error":[82],"rate":[83],"(TS-WER)":[84],"on":[85,96],"WSJ0-2mix-extr":[86],"(4.2%).":[87],"Further,":[88],"report":[90],"first":[93],"time":[94],"TS-WER":[95],"WSJ0-3mix-extr":[97],"(12.4%),":[98],"LibriSpeech2Mix":[99],"(4.2%)":[100],"LibriSpeech3Mix":[102],"(7.6%)":[103],"datasets,":[104],"establishing":[105],"new":[106],"benchmarks":[107],"TS-ASR.":[109],"proposed":[111],"will":[113],"be":[114],"open-sourced":[115],"through":[116],"NVIDIA":[117],"NeMo":[118],"toolkit.":[119]},"counts_by_year":[{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":7}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2023-05-07T00:00:00"}
