{"id":"https://openalex.org/W4392904174","doi":"https://doi.org/10.1109/icassp48485.2024.10446661","title":"Neural Speaker Diarization Using Memory-Aware Multi-Speaker Embedding with Sequence-to-Sequence Architecture","display_name":"Neural Speaker Diarization Using Memory-Aware Multi-Speaker Embedding with Sequence-to-Sequence Architecture","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392904174","doi":"https://doi.org/10.1109/icassp48485.2024.10446661"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446661","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446661","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113145216","display_name":"Gaobin Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Gaobin Yang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018657729","display_name":"Maokui He","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Maokui He","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064699619","display_name":"Shutong Niu","orcid":"https://orcid.org/0000-0003-0315-1042"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shutong Niu","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033777521","display_name":"Ruoyu Wang","orcid":"https://orcid.org/0000-0002-3644-1284"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruoyu Wang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091088002","display_name":"Yanyan Yue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yanyan Yue","raw_affiliation_strings":["iFlytek Research,Hefei,China","iFlytek Research, Hefei, China"],"affiliations":[{"raw_affiliation_string":"iFlytek Research,Hefei,China","institution_ids":[]},{"raw_affiliation_string":"iFlytek Research, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034371072","display_name":"Shuangqing Qian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuangqing Qian","raw_affiliation_strings":["iFlytek Research,Hefei,China","iFlytek Research, Hefei, China"],"affiliations":[{"raw_affiliation_string":"iFlytek Research,Hefei,China","institution_ids":[]},{"raw_affiliation_string":"iFlytek Research, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102344609","display_name":"Shilong Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shilong Wu","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066595711","display_name":"Jun Du","orcid":"https://orcid.org/0000-0002-2387-0389"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Du","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066868860","display_name":"Chin\u2010Hui Lee","orcid":"https://orcid.org/0000-0002-1892-2551"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chin-Hui Lee","raw_affiliation_strings":["Georgia Institute of Technology,Atlanta,GA,USA","Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology,Atlanta,GA,USA","institution_ids":["https://openalex.org/I130701444"]},{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5113145216"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":3.2635,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.92537804,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"11626","last_page":"11630"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8096951246261597},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.775539219379425},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6436854600906372},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6359915733337402},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.5791054368019104},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5686604976654053},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4977622330188751},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4410936236381531},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.41546595096588135},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.41449421644210815},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.390683114528656},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.1301788091659546}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8096951246261597},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.775539219379425},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6436854600906372},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6359915733337402},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.5791054368019104},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5686604976654053},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4977622330188751},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4410936236381531},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.41546595096588135},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.41449421644210815},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.390683114528656},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.1301788091659546},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446661","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446661","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1607302042","https://openalex.org/W2038101708","https://openalex.org/W2081074144","https://openalex.org/W2093499222","https://openalex.org/W2150769028","https://openalex.org/W2765407302","https://openalex.org/W2890964092","https://openalex.org/W2962788625","https://openalex.org/W2963470929","https://openalex.org/W2989863749","https://openalex.org/W3008357631","https://openalex.org/W3016031604","https://openalex.org/W3024869864","https://openalex.org/W3025260599","https://openalex.org/W3097777922","https://openalex.org/W3140898556","https://openalex.org/W3195050710","https://openalex.org/W3196595845","https://openalex.org/W3212886388","https://openalex.org/W4226013992","https://openalex.org/W4362683557","https://openalex.org/W4375868885","https://openalex.org/W4389315128","https://openalex.org/W4389315135"],"related_works":["https://openalex.org/W4384929466","https://openalex.org/W2206035908","https://openalex.org/W3148366653","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1999004162","https://openalex.org/W2162582511","https://openalex.org/W2125642021"],"abstract_inverted_index":{"We":[0],"propose":[1],"a":[2,54,66,81,114,125],"novel":[3],"neural":[4],"speaker":[5],"diarization":[6,68],"system":[7,139],"using":[8],"memory-aware":[9,21],"multi-speaker":[10,22,130],"embedding":[11,23],"with":[12],"sequence-to-sequence":[13,26],"architecture":[14],"(NSD-MS2S),":[15],"which":[16,79],"integrates":[17],"the":[18,41,75,87,93,100,104,133,138,143],"strengths":[19],"of":[20,44,72,84,107],"(MA-MSE)":[24],"and":[25,35,51,91,127],"(Seq2Seq)":[27],"architecture,":[28],"leading":[29],"to":[30,58,98,122,136],"improvement":[31,83],"in":[32,119,142],"both":[33],"efficiency":[34],"performance.":[36],"Next,":[37],"we":[38,112,140],"further":[39],"decrease":[40],"memory":[42],"occupation":[43],"decoding":[45],"by":[46],"incorporating":[47],"input":[48],"features":[49,60],"fusion":[50],"then":[52],"employ":[53],"multi-head":[55],"attention":[56],"mechanism":[57],"capture":[59],"at":[61,151],"different":[62],"levels.":[63],"NSD-MS2S":[64],"achieved":[65],"macro":[67],"error":[69],"rate":[70],"(DER)":[71],"15.9%":[73],"on":[74],"CHiME-7":[76,108,144],"EVAL":[77],"set,":[78],"signifies":[80],"relative":[82],"49%":[85],"over":[86],"official":[88],"baseline":[89],"system,":[90],"is":[92,149],"key":[94],"technique":[95],"for":[96,103],"us":[97],"achieve":[99],"best":[101],"performance":[102],"main":[105],"track":[106],"DASR":[109,145],"Challenge.":[110,146],"Additionally,":[111],"introduce":[113],"deep":[115],"interactive":[116],"module":[117,121],"(DIM)":[118],"MA-MSE":[120],"better":[123],"retrieve":[124],"cleaner":[126],"more":[128],"discriminative":[129],"embedding,":[131],"enabling":[132],"current":[134],"model":[135],"outperform":[137],"used":[141],"Our":[147],"code":[148],"available":[150],"https://github.com/liyunlongaaa/NSD-MS2S.":[152]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
