{"id":"https://openalex.org/W4221147513","doi":"https://doi.org/10.1109/icassp43922.2022.9747323","title":"Endpoint Detection for Streaming End-to-End Multi-Talker ASR","display_name":"Endpoint Detection for Streaming End-to-End Multi-Talker ASR","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4221147513","doi":"https://doi.org/10.1109/icassp43922.2022.9747323"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747323","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747323","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101607148","display_name":"Liang Lu","orcid":"https://orcid.org/0000-0003-4005-679X"},"institutions":[{"id":"https://openalex.org/I70848387","display_name":"Otter Tail Corporation (United States)","ror":"https://ror.org/02y85x967","country_code":"US","type":"company","lineage":["https://openalex.org/I70848387"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Liang Lu","raw_affiliation_strings":["Otter.ai,Mountain View,CA,USA","Otter.ai, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"Otter.ai,Mountain View,CA,USA","institution_ids":[]},{"raw_affiliation_string":"Otter.ai, Mountain View, CA, USA","institution_ids":["https://openalex.org/I70848387"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365056","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-5206-8600"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corp,USA","Microsoft Corp, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp,USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corp, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077401426","display_name":"Yifan Gong","orcid":"https://orcid.org/0000-0001-8786-3391"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifan Gong","raw_affiliation_strings":["Microsoft Corp,USA","Microsoft Corp, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp,USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corp, USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101607148"],"corresponding_institution_ids":["https://openalex.org/I70848387"],"apc_list":null,"apc_paid":null,"fwci":1.2473,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.80909287,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7916046977043152},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7084777355194092},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7053792476654053},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.6883237361907959},{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.5499982237815857},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5494763255119324},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.5207131505012512},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4979250431060791},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4302743077278137},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3743467926979065},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3345192074775696},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3212655186653137},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.06983578205108643}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7916046977043152},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7084777355194092},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7053792476654053},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.6883237361907959},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.5499982237815857},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5494763255119324},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.5207131505012512},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4979250431060791},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4302743077278137},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3743467926979065},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3345192074775696},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3212655186653137},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.06983578205108643},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747323","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747323","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7900000214576721,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1524333225","https://openalex.org/W1536583098","https://openalex.org/W1828163288","https://openalex.org/W1932883564","https://openalex.org/W1999454387","https://openalex.org/W2460742184","https://openalex.org/W2889503488","https://openalex.org/W2890244912","https://openalex.org/W2935756939","https://openalex.org/W2962784628","https://openalex.org/W2963574857","https://openalex.org/W2972818416","https://openalex.org/W3013139777","https://openalex.org/W3015746570","https://openalex.org/W3015927303","https://openalex.org/W3016010032","https://openalex.org/W3016232124","https://openalex.org/W3097643313","https://openalex.org/W3097973766","https://openalex.org/W3109079702","https://openalex.org/W3160133086","https://openalex.org/W3161873870","https://openalex.org/W3162847598","https://openalex.org/W3163907627","https://openalex.org/W3186652349","https://openalex.org/W3200955206","https://openalex.org/W4288088457","https://openalex.org/W6631362777","https://openalex.org/W6638749077","https://openalex.org/W6755552547","https://openalex.org/W6768009688","https://openalex.org/W6769806307","https://openalex.org/W6775489429","https://openalex.org/W6786783632","https://openalex.org/W6801135687"],"related_works":["https://openalex.org/W642007152","https://openalex.org/W2401827384","https://openalex.org/W2355290951","https://openalex.org/W2069501481","https://openalex.org/W3126788496","https://openalex.org/W2103239478","https://openalex.org/W2552102772","https://openalex.org/W1510046822","https://openalex.org/W2052688117","https://openalex.org/W4294771049"],"abstract_inverted_index":{"Streaming":[0,48],"end-to-end":[1,107,136],"multi-talker":[2,106],"speech":[3,10,35,39,74],"recognition":[4,40,75,179],"aims":[5],"at":[6],"transcribing":[7],"the":[8,34,38,47,73,82,114,119,132,151,160,166,178],"overlapped":[9],"from":[11,27],"conversations":[12],"or":[13],"meetings":[14],"with":[15],"an":[16,124,128],"all-neural":[17],"model":[18,54,168],"in":[19,118],"a":[20,28,85,142],"streaming":[21],"fashion,":[22],"which":[23],"is":[24,77],"fundamentally":[25],"different":[26],"modular-based":[29],"approach":[30,145],"that":[31,146,165],"usually":[32],"cascades":[33],"separation":[36],"and":[37,50,65],"models":[41],"trained":[42],"independently.":[43],"Previously,":[44],"we":[45,112,139],"proposed":[46],"Unmixing":[49],"Recognition":[51],"Transducer":[52],"(SURT)":[53],"based":[55,158],"on":[56,159],"recurrent":[57],"neural":[58],"network":[59],"transducer":[60],"(RNN-T)":[61],"for":[62,70,89,105],"this":[63,110],"problem":[64,117],"presented":[66],"promising":[67,171],"results.":[68],"However,":[69],"real":[71],"applications,":[72],"system":[76,91],"also":[78,140],"required":[79],"to":[80],"determine":[81],"times-tamp":[83],"when":[84],"speaker":[86],"finishes":[87],"speaking":[88],"prompt":[90],"response.":[92],"This":[93],"problem,":[94],"known":[95],"as":[96,127],"endpoint":[97],"(EP)":[98],"detection,":[99],"has":[100],"not":[101],"been":[102],"studied":[103],"previously":[104],"models.":[108,137],"In":[109],"work,":[111],"address":[113],"EP":[115,152,172],"detection":[116,153,173],"SURT":[120,167],"framework":[121],"by":[122],"introducing":[123],"end-of-sentence":[125],"token":[126],"output":[129],"unit,":[130],"following":[131],"practice":[133],"of":[134,177],"single-talker":[135],"Furthermore,":[138],"present":[141],"latency":[143],"penalty":[144],"can":[147,169],"significantly":[148,175],"cut":[149],"down":[150],"latency.":[154],"Our":[155],"experimental":[156],"results":[157],"2-speaker":[161],"LibrispeechMix":[162],"dataset":[163],"show":[164],"achieve":[170],"without":[174],"degradation":[176],"accuracy.":[180]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":7}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
