{"id":"https://openalex.org/W4392903601","doi":"https://doi.org/10.1109/icassp48485.2024.10448494","title":"Multi-View Speaker Embedding Learning for Enhanced Stability and Discriminability","display_name":"Multi-View Speaker Embedding Learning for Enhanced Stability and Discriminability","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903601","doi":"https://doi.org/10.1109/icassp48485.2024.10448494"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10448494","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448494","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049944728","display_name":"Liang He","orcid":"https://orcid.org/0000-0003-4076-7479"},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang He","raw_affiliation_strings":["Xinjiang University,School of Computer Science and Technology,Urumqi,China","Xinjiang Key Laboratory of Signal Detection and Processing, Urumqi, China","School of Computer Science and Technology, Xinjiang University, Urumqi, China","Department of Electronic Engineering, Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Xinjiang University,School of Computer Science and Technology,Urumqi,China","institution_ids":["https://openalex.org/I96908189"]},{"raw_affiliation_string":"Xinjiang Key Laboratory of Signal Detection and Processing, Urumqi, China","institution_ids":[]},{"raw_affiliation_string":"School of Computer Science and Technology, Xinjiang University, Urumqi, China","institution_ids":["https://openalex.org/I96908189"]},{"raw_affiliation_string":"Department of Electronic Engineering, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000449705","display_name":"Zhihua Fang","orcid":"https://orcid.org/0000-0002-3018-7414"},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhihua Fang","raw_affiliation_strings":["Xinjiang University,School of Computer Science and Technology,Urumqi,China","Xinjiang Key Laboratory of Signal Detection and Processing, Urumqi, China","School of Computer Science and Technology, Xinjiang University, Urumqi, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Xinjiang University,School of Computer Science and Technology,Urumqi,China","institution_ids":["https://openalex.org/I96908189"]},{"raw_affiliation_string":"Xinjiang Key Laboratory of Signal Detection and Processing, Urumqi, China","institution_ids":[]},{"raw_affiliation_string":"School of Computer Science and Technology, Xinjiang University, Urumqi, China","institution_ids":["https://openalex.org/I96908189"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008850047","display_name":"Zuoer Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zuoer Chen","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering,Beijing,China","Department of Electronic Engineering, Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering,Beijing,China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Department of Electronic Engineering, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100413867","display_name":"Minqiang Xu","orcid":"https://orcid.org/0000-0003-3625-1736"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minqiang Xu","raw_affiliation_strings":["iFly Digital Technology,Hefei,China","iFly Digital Technology, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iFly Digital Technology,Hefei,China","institution_ids":["https://openalex.org/I16365422"]},{"raw_affiliation_string":"iFly Digital Technology, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113580327","display_name":"Ying Meng","orcid":null},"institutions":[{"id":"https://openalex.org/I96908189","display_name":"Xinjiang University","ror":"https://ror.org/059gw8r13","country_code":"CN","type":"education","lineage":["https://openalex.org/I96908189"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Meng","raw_affiliation_strings":["Xinjiang University,School of Computer Science and Technology,Urumqi,China","Xinjiang Key Laboratory of Signal Detection and Processing, Urumqi, China","School of Computer Science and Technology, Xinjiang University, Urumqi, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Xinjiang University,School of Computer Science and Technology,Urumqi,China","institution_ids":["https://openalex.org/I96908189"]},{"raw_affiliation_string":"Xinjiang Key Laboratory of Signal Detection and Processing, Urumqi, China","institution_ids":[]},{"raw_affiliation_string":"School of Computer Science and Technology, Xinjiang University, Urumqi, China","institution_ids":["https://openalex.org/I96908189"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101510763","display_name":"Penghao Wang","orcid":"https://orcid.org/0009-0005-8431-5388"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Penghao Wang","raw_affiliation_strings":["Tsinghua University,Department of Electronic Engineering,Beijing,China","Department of Electronic Engineering, Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Electronic Engineering,Beijing,China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Department of Electronic Engineering, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.6109,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.71065896,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"10081","last_page":"10085"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9908000230789185,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.804894745349884},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7613459825515747},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7099155187606812},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.651067852973938},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6421092748641968},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.6174074411392212},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.615066647529602},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5792053937911987},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.557047963142395},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.5566377639770508},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5391443371772766},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5110150575637817},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5012834072113037},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4881966710090637},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4225970208644867},{"id":"https://openalex.org/keywords/speaker-verification","display_name":"Speaker verification","score":0.41587987542152405},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.22601699829101562}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.804894745349884},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7613459825515747},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7099155187606812},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.651067852973938},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6421092748641968},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.6174074411392212},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.615066647529602},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5792053937911987},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.557047963142395},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.5566377639770508},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5391443371772766},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5110150575637817},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5012834072113037},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4881966710090637},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4225970208644867},{"id":"https://openalex.org/C2982762665","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker verification","level":3,"score":0.41587987542152405},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.22601699829101562},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10448494","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448494","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.800000011920929,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[{"id":"https://openalex.org/G2224866389","display_name":null,"funder_award_id":"62366051","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1985867823","https://openalex.org/W2219249508","https://openalex.org/W2696967604","https://openalex.org/W2726515241","https://openalex.org/W2784163702","https://openalex.org/W2808631503","https://openalex.org/W2883725317","https://openalex.org/W2890964092","https://openalex.org/W2916104401","https://openalex.org/W2963902346","https://openalex.org/W3024869864","https://openalex.org/W3038871978","https://openalex.org/W3103152812","https://openalex.org/W3111481301","https://openalex.org/W3135048627","https://openalex.org/W3137249133","https://openalex.org/W3160743249","https://openalex.org/W3162347631","https://openalex.org/W3162899647","https://openalex.org/W3166898278","https://openalex.org/W3198161435","https://openalex.org/W4372271885","https://openalex.org/W4385822752","https://openalex.org/W6688816777","https://openalex.org/W6787439801"],"related_works":["https://openalex.org/W4384929466","https://openalex.org/W2206035908","https://openalex.org/W66821593","https://openalex.org/W2053159670","https://openalex.org/W3148366653","https://openalex.org/W2149220986","https://openalex.org/W1521299571","https://openalex.org/W1493012537","https://openalex.org/W2144470400","https://openalex.org/W2911612049"],"abstract_inverted_index":{"Deep":[0],"neural":[1],"network":[2],"models":[3],"based":[4,42],"on":[5,43,86,98,144],"x-vector":[6],"have":[7],"become":[8],"the":[9,17,36,51,58,71,78,88,94,100,104,115,128,139,142,153,156],"most":[10,37],"popular":[11,38],"framework":[12],"for":[13,25],"speaker":[14,20,30,33,79,135,146,149],"recognition,":[15],"and":[16,32,122,133,148,152],"quality":[18],"of":[19,90,103,114,130,141,158],"features":[21],"(embeddings)":[22],"is":[23,41],"important":[24],"open-set":[26],"tasks":[27],"such":[28],"as":[29],"verification":[31,147],"diarization.":[34],"Currently,":[35],"loss":[39],"function":[40],"margin":[44],"penalty,":[45],"however,":[46],"it":[47],"only":[48],"considers":[49],"enlarging":[50],"inter-class":[52,120],"distance":[53,121],"while":[54,93],"neglecting":[55],"to":[56],"reduce":[57],"intra-class":[59,108,124],"feature":[60,101],"differences.":[61],"Therefore,":[62],"we":[63],"propose":[64],"a":[65],"multi-view":[66],"learning":[67],"approach":[68],"that":[69],"divides":[70],"training":[72],"process":[73],"into":[74],"two":[75,116],"views":[76],"from":[77],"embedding":[80],"level.":[81],"The":[82,111],"classification":[83],"view":[84,96],"focuses":[85,97],"distinguishing":[87],"discriminability":[89],"different":[91],"speakers,":[92],"clustering":[95],"shrinking":[99],"boundaries":[102],"same":[105],"speaker,":[106],"making":[107],"differences":[109],"smaller.":[110],"combined":[112],"effect":[113],"perspectives":[117],"achieves":[118],"large":[119],"small":[123],"distances,":[125],"resulting":[126],"in":[127],"extraction":[129],"more":[131],"discriminative":[132],"stable":[134],"embeddings.":[136],"We":[137],"test":[138],"performance":[140],"method":[143],"both":[145],"diarization":[150],"tasks,":[151],"results":[154],"demonstrate":[155],"effectiveness":[157],"our":[159],"approach.":[160]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
