{"id":"https://openalex.org/W3128434617","doi":"https://doi.org/10.1109/slt48900.2021.9383523","title":"Online End-To-End Neural Diarization with Speaker-Tracing Buffer","display_name":"Online End-To-End Neural Diarization with Speaker-Tracing Buffer","publication_year":2021,"publication_date":"2021-01-19","ids":{"openalex":"https://openalex.org/W3128434617","doi":"https://doi.org/10.1109/slt48900.2021.9383523","mag":"3128434617"},"language":"en","primary_location":{"id":"doi:10.1109/slt48900.2021.9383523","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101012633","display_name":"Yawen Xue","orcid":null},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yawen Xue","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026324656","display_name":"Shota Horiguchi","orcid":"https://orcid.org/0000-0002-3166-4956"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shota Horiguchi","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044818016","display_name":"Yusuke Fujita","orcid":"https://orcid.org/0000-0002-6523-8146"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yusuke Fujita","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Center for Language and Speech Processing, Johns Hopkins University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059858850","display_name":"Leibny Paola Garcia","orcid":"https://orcid.org/0000-0002-7449-5726"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Paola Garcia","raw_affiliation_strings":["Center for Language and Speech Processing, Johns Hopkins University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076987349","display_name":"Kenji Nagamatsu","orcid":null},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kenji Nagamatsu","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group","institution_ids":["https://openalex.org/I65143321"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.058,"has_fulltext":false,"cited_by_count":38,"citation_normalized_percentile":{"value":0.94698092,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"841","last_page":"848"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.889251708984375},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.806187093257904},{"id":"https://openalex.org/keywords/tracing","display_name":"Tracing","score":0.6753051280975342},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5736293792724609},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5279319286346436},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4991776943206787},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.4438365399837494},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.429205060005188},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.3350779712200165},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3212272822856903},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.07982531189918518}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.889251708984375},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.806187093257904},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.6753051280975342},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5736293792724609},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5279319286346436},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4991776943206787},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.4438365399837494},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.429205060005188},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.3350779712200165},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3212272822856903},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.07982531189918518},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt48900.2021.9383523","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W9688279","https://openalex.org/W48568691","https://openalex.org/W67702164","https://openalex.org/W123007118","https://openalex.org/W1524333225","https://openalex.org/W1589137271","https://openalex.org/W1608667326","https://openalex.org/W1965819578","https://openalex.org/W1997873121","https://openalex.org/W2038101708","https://openalex.org/W2148613904","https://openalex.org/W2219249508","https://openalex.org/W2288817436","https://openalex.org/W2405137844","https://openalex.org/W2638067502","https://openalex.org/W2696967604","https://openalex.org/W2746574320","https://openalex.org/W2748488820","https://openalex.org/W2807429203","https://openalex.org/W2884797218","https://openalex.org/W2890964092","https://openalex.org/W2891247151","https://openalex.org/W2896538040","https://openalex.org/W2900440209","https://openalex.org/W2952752702","https://openalex.org/W2962788625","https://openalex.org/W2963470929","https://openalex.org/W2967957380","https://openalex.org/W2972949456","https://openalex.org/W3008104819","https://openalex.org/W3008357631","https://openalex.org/W3010196324","https://openalex.org/W3015308613","https://openalex.org/W3015621653","https://openalex.org/W3020336359","https://openalex.org/W3024104148","https://openalex.org/W3033627755","https://openalex.org/W3034729383","https://openalex.org/W3095212884","https://openalex.org/W4214556932","https://openalex.org/W6601947329","https://openalex.org/W6631362777","https://openalex.org/W6636488977","https://openalex.org/W6688816777","https://openalex.org/W6745415975","https://openalex.org/W6774558098","https://openalex.org/W6779069803","https://openalex.org/W6779469704"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1999004162","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2023466863","https://openalex.org/W2696990509"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"a":[3,11,21,43,64,82],"novel":[4],"online":[5,130],"speaker":[6,31,54],"diarization":[7,18,89],"algorithm":[8],"based":[9],"on":[10],"fully":[12],"supervised":[13],"self-attention":[14,83],"mechanism":[15,46],"(SA-EEND).":[16],"Online":[17],"inherently":[19],"presents":[20],"speaker's":[22],"permutation":[23,55],"problem":[24],"due":[25],"to":[26,29,113],"the":[27,35,53,72,76,92,95,100,115,123],"possibility":[28],"assign":[30],"regions":[32],"incorrectly":[33],"across":[34,91],"recording.":[36],"To":[37],"circumvent":[38],"this":[39],"inconsistency,":[40],"we":[41,107],"proposed":[42],"speaker-tracing":[44,124],"buffer":[45,93,125],"that":[47],"selects":[48],"several":[49],"input":[50,73],"frames":[51,68,74],"representing":[52],"information":[56],"from":[57],"previous":[58],"chunks":[59],"and":[60,79,94,119,132,141],"stores":[61],"them":[62],"in":[63,75],"buffer.":[65],"These":[66],"buffered":[67],"are":[69],"stacked":[70],"with":[71,110,145],"current":[77,96],"chunk":[78,97],"fed":[80],"into":[81],"network.":[84],"Our":[85],"method":[86],"ensures":[87],"consistent":[88],"outputs":[90],"by":[98,122],"checking":[99],"correlation":[101],"between":[102,117],"their":[103],"corresponding":[104],"outputs.":[105],"Additionally,":[106],"trained":[108],"SA-EEND":[109,131],"variable":[111,133],"chunk-sizes":[112],"mitigate":[114],"mismatch":[116],"training":[118],"inference":[120],"introduced":[121],"mechanism.":[126],"Experimental":[127],"results,":[128],"including":[129],"chunk-size,":[134],"achieved":[135],"DERs":[136],"of":[137],"12.54%":[138],"for":[139,143],"CALLHOME":[140],"20.77%":[142],"CSJ":[144],"1.4":[146],"s":[147],"actual":[148],"latency.":[149]},"counts_by_year":[{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":9},{"year":2021,"cited_by_count":8},{"year":2020,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
