{"id":"https://openalex.org/W3008357631","doi":"https://doi.org/10.1109/asru46091.2019.9003959","title":"End-to-End Neural Speaker Diarization with Self-Attention","display_name":"End-to-End Neural Speaker Diarization with Self-Attention","publication_year":2019,"publication_date":"2019-12-01","ids":{"openalex":"https://openalex.org/W3008357631","doi":"https://doi.org/10.1109/asru46091.2019.9003959","mag":"3008357631"},"language":"en","primary_location":{"id":"doi:10.1109/asru46091.2019.9003959","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru46091.2019.9003959","pdf_url":null,"source":{"id":"https://openalex.org/S4306498489","display_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044818016","display_name":"Yusuke Fujita","orcid":"https://orcid.org/0000-0002-6523-8146"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yusuke Fujita","raw_affiliation_strings":["Center for Language and Speech Processing, Johns Hopkins University, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Johns Hopkins University, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016279564","display_name":"Naoyuki Kanda","orcid":"https://orcid.org/0000-0002-8628-3288"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Naoyuki Kanda","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026324656","display_name":"Shota Horiguchi","orcid":"https://orcid.org/0000-0002-3166-4956"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shota Horiguchi","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101012633","display_name":"Yawen Xue","orcid":null},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yawen Xue","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076987349","display_name":"Kenji Nagamatsu","orcid":null},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kenji Nagamatsu","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Center for Language and Speech Processing, Johns Hopkins University, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Johns Hopkins University, USA","institution_ids":["https://openalex.org/I145311948"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":13.7388,"has_fulltext":false,"cited_by_count":203,"citation_normalized_percentile":{"value":0.99047517,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"296","last_page":"303"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.8982255458831787},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8301458358764648},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6771819591522217},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6689381003379822},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.6041056513786316},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.47825896739959717},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.44875064492225647},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.44497692584991455},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4115685820579529},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36985570192337036}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.8982255458831787},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8301458358764648},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6771819591522217},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6689381003379822},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.6041056513786316},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.47825896739959717},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.44875064492225647},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.44497692584991455},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4115685820579529},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36985570192337036},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru46091.2019.9003959","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru46091.2019.9003959","pdf_url":null,"source":{"id":"https://openalex.org/S4306498489","display_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.4000000059604645,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":70,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1485783873","https://openalex.org/W1522301498","https://openalex.org/W1524333225","https://openalex.org/W1591607137","https://openalex.org/W1965819578","https://openalex.org/W2038101708","https://openalex.org/W2079735306","https://openalex.org/W2081074144","https://openalex.org/W2093499222","https://openalex.org/W2130942839","https://openalex.org/W2133564696","https://openalex.org/W2150769028","https://openalex.org/W2159591770","https://openalex.org/W2170579896","https://openalex.org/W2219249508","https://openalex.org/W2221409856","https://openalex.org/W2327501763","https://openalex.org/W2460742184","https://openalex.org/W2597655663","https://openalex.org/W2620757702","https://openalex.org/W2638067502","https://openalex.org/W2696967604","https://openalex.org/W2734774145","https://openalex.org/W2746574320","https://openalex.org/W2766219058","https://openalex.org/W2786458517","https://openalex.org/W2884797218","https://openalex.org/W2889031312","https://openalex.org/W2889256969","https://openalex.org/W2889381673","https://openalex.org/W2889418727","https://openalex.org/W2889519245","https://openalex.org/W2890964092","https://openalex.org/W2891247151","https://openalex.org/W2892009249","https://openalex.org/W2896538040","https://openalex.org/W2900091092","https://openalex.org/W2900212944","https://openalex.org/W2900440209","https://openalex.org/W2901997113","https://openalex.org/W2938358845","https://openalex.org/W2939690918","https://openalex.org/W2962788625","https://openalex.org/W2962918106","https://openalex.org/W2963091558","https://openalex.org/W2963386218","https://openalex.org/W2963403868","https://openalex.org/W2963470929","https://openalex.org/W2963609956","https://openalex.org/W2964089206","https://openalex.org/W2964121744","https://openalex.org/W2964308564","https://openalex.org/W2972949456","https://openalex.org/W2980088508","https://openalex.org/W3034729383","https://openalex.org/W4285719527","https://openalex.org/W4294589828","https://openalex.org/W4385245566","https://openalex.org/W6623517193","https://openalex.org/W6631190155","https://openalex.org/W6631362777","https://openalex.org/W6679434410","https://openalex.org/W6679436768","https://openalex.org/W6735377749","https://openalex.org/W6738902873","https://openalex.org/W6739901393","https://openalex.org/W6756197946","https://openalex.org/W6761275059","https://openalex.org/W6779469704"],"related_works":["https://openalex.org/W1521049138","https://openalex.org/W1509309911","https://openalex.org/W2144208207","https://openalex.org/W2499802997","https://openalex.org/W1497807607","https://openalex.org/W1813780412","https://openalex.org/W2128773298","https://openalex.org/W2160753975","https://openalex.org/W1940231550","https://openalex.org/W2118860825"],"abstract_inverted_index":{"Speaker":[0],"diarization":[1,29,61,117],"has":[2,17],"been":[3],"mainly":[4],"developed":[5],"based":[6],"on":[7,92,103,124],"the":[8,14,44,105,115,139,142,157,169,177,183],"clustering":[9],"of":[10,81,168],"speaker":[11,37,60,116,188],"embeddings.":[12],"However,":[13],"clustering-based":[15,172],"approach":[16],"two":[18],"major":[19],"problems;":[20],"i.e.,":[21],"(i)":[22],"it":[23,34,109],"is":[24,89,100,200],"not":[25],"optimized":[26],"to":[27,86,144,192],"minimize":[28],"errors":[30],"directly,":[31],"and":[32,95,130,148],"(ii)":[33],"cannot":[35],"handle":[36],"overlaps":[38],"correctly.":[39],"To":[40],"solve":[41],"these":[42],"problems,":[43],"End-to-End":[45],"Neural":[46],"Diarization":[47],"(EEND),":[48],"in":[49,190],"which":[50,88],"a":[51,64],"bidirectional":[52],"long":[53],"short-term":[54],"memory":[55],"(BLSTM)":[56],"network":[57],"directly":[58,101],"outputs":[59],"results":[62,136],"given":[63],"multi-talker":[65],"recording,":[66],"was":[67,141,163],"recently":[68],"proposed.":[69],"In":[70,84],"this":[71],"study,":[72],"we":[73,180],"enhance":[74],"EEND":[75],"by":[76,175],"introducing":[77],"self-attention":[78,99,140,184],"blocks":[79],"instead":[80],"BLSTM":[82],"blocks.":[83],"contrast":[85],"BLSTM,":[87],"conditioned":[90,102],"only":[91],"its":[93],"previous":[94],"next":[96],"hidden":[97],"states,":[98],"all":[104],"other":[106],"frames,":[107],"making":[108],"much":[110],"suitable":[111],"for":[112],"dealing":[113],"with":[114],"problem.":[118],"We":[119],"evaluated":[120],"our":[121,150],"proposed":[122,151],"method":[123,152,162],"simulated":[125],"mixtures,":[126],"real":[127,131],"telephone":[128],"calls,":[129],"dialogue":[132],"recordings.":[133],"The":[134],"experimental":[135],"revealed":[137],"that":[138,149,167,182],"key":[143],"achieving":[145],"good":[146],"performance":[147],"performed":[153],"significantly":[154],"better":[155,165],"than":[156,166],"conventional":[158],"BLSTM-based":[159],"method.":[160,173],"Our":[161,197],"even":[164],"state-of-the-art":[170],"x-vector":[171],"Finally,":[174],"visualizing":[176],"latent":[178],"representation,":[179],"show":[181],"can":[185],"capture":[186],"global":[187],"characteristics":[189],"addition":[191],"local":[193],"speech":[194],"activity":[195],"dynamics.":[196],"source":[198],"code":[199],"available":[201],"online":[202],"at":[203],"https://github.com/hitachi-speech/EEND.":[204]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":33},{"year":2024,"cited_by_count":29},{"year":2023,"cited_by_count":38},{"year":2022,"cited_by_count":45},{"year":2021,"cited_by_count":30},{"year":2020,"cited_by_count":17},{"year":2019,"cited_by_count":3},{"year":2018,"cited_by_count":1}],"updated_date":"2026-06-16T09:24:06.705377","created_date":"2025-10-10T00:00:00"}
