{"id":"https://openalex.org/W3008230514","doi":"https://doi.org/10.1109/asru46091.2019.9003860","title":"Joint Optimization of Classification and Clustering for Deep Speaker Embedding","display_name":"Joint Optimization of Classification and Clustering for Deep Speaker Embedding","publication_year":2019,"publication_date":"2019-12-01","ids":{"openalex":"https://openalex.org/W3008230514","doi":"https://doi.org/10.1109/asru46091.2019.9003860","mag":"3008230514"},"language":"en","primary_location":{"id":"doi:10.1109/asru46091.2019.9003860","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru46091.2019.9003860","pdf_url":null,"source":{"id":"https://openalex.org/S4306498489","display_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100372797","display_name":"Zhiming Wang","orcid":"https://orcid.org/0000-0001-8777-1729"},"institutions":[{"id":"https://openalex.org/I4210090985","display_name":"Zhejiang Financial College","ror":"https://ror.org/00deghz86","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210090985"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiming Wang","raw_affiliation_strings":["Ant Financial Services Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Ant Financial Services Group, Hangzhou, China","institution_ids":["https://openalex.org/I4210090985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103119755","display_name":"Kaisheng Yao","orcid":"https://orcid.org/0000-0002-8949-9367"},"institutions":[{"id":"https://openalex.org/I4210090985","display_name":"Zhejiang Financial College","ror":"https://ror.org/00deghz86","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210090985"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kaisheng Yao","raw_affiliation_strings":["Ant Financial Services Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Ant Financial Services Group, Hangzhou, China","institution_ids":["https://openalex.org/I4210090985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090616461","display_name":"Shuo Fang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090985","display_name":"Zhejiang Financial College","ror":"https://ror.org/00deghz86","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210090985"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuo Fang","raw_affiliation_strings":["Ant Financial Services Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Ant Financial Services Group, Hangzhou, China","institution_ids":["https://openalex.org/I4210090985"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100371535","display_name":"Xiaolong Li","orcid":"https://orcid.org/0000-0001-7493-2650"},"institutions":[{"id":"https://openalex.org/I4210090985","display_name":"Zhejiang Financial College","ror":"https://ror.org/00deghz86","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210090985"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaolong Li","raw_affiliation_strings":["Ant Financial Services Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Ant Financial Services Group, Hangzhou, China","institution_ids":["https://openalex.org/I4210090985"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100372797"],"corresponding_institution_ids":["https://openalex.org/I4210090985"],"apc_list":null,"apc_paid":null,"fwci":0.5601,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.76365979,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"2","issue":null,"first_page":"284","last_page":"290"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7894606590270996},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6918670535087585},{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.6751267313957214},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.6722407341003418},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5980554223060608},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5813404321670532},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.5292477011680603},{"id":"https://openalex.org/keywords/speaker-identification","display_name":"Speaker identification","score":0.5149531960487366},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4904462993144989},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.4853023886680603},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4841107130050659},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.45492875576019287},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.44656023383140564},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.2921391427516937},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.2568361759185791},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06641343235969543}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7894606590270996},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6918670535087585},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.6751267313957214},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.6722407341003418},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5980554223060608},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5813404321670532},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.5292477011680603},{"id":"https://openalex.org/C2986627078","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker identification","level":3,"score":0.5149531960487366},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4904462993144989},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.4853023886680603},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4841107130050659},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.45492875576019287},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.44656023383140564},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2921391427516937},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2568361759185791},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06641343235969543},{"id":"https://openalex.org/C50522688","wikidata":"https://www.wikidata.org/wiki/Q189833","display_name":"Economic growth","level":1,"score":0.0},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru46091.2019.9003860","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru46091.2019.9003860","pdf_url":null,"source":{"id":"https://openalex.org/S4306498489","display_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.6600000262260437}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1533861849","https://openalex.org/W1589137271","https://openalex.org/W2064364374","https://openalex.org/W2127672898","https://openalex.org/W2150769028","https://openalex.org/W2194775991","https://openalex.org/W2520774990","https://openalex.org/W2612434969","https://openalex.org/W2726515241","https://openalex.org/W2748488820","https://openalex.org/W2784163702","https://openalex.org/W2794506738","https://openalex.org/W2808631503","https://openalex.org/W2884412522","https://openalex.org/W2888897023","https://openalex.org/W2890964092","https://openalex.org/W2916104401","https://openalex.org/W2937328535","https://openalex.org/W2940070181","https://openalex.org/W2963166243","https://openalex.org/W2963371159","https://openalex.org/W2963382494","https://openalex.org/W2963466847","https://openalex.org/W2963656735","https://openalex.org/W2964247977","https://openalex.org/W2969985801","https://openalex.org/W3103152812","https://openalex.org/W4289750118","https://openalex.org/W6631943919","https://openalex.org/W6726946684","https://openalex.org/W6735013348","https://openalex.org/W6761649062"],"related_works":["https://openalex.org/W2982889384","https://openalex.org/W4226227567","https://openalex.org/W2971218105","https://openalex.org/W4287113729","https://openalex.org/W3173314472","https://openalex.org/W3103152812","https://openalex.org/W2128073728","https://openalex.org/W1965383186","https://openalex.org/W2129090883","https://openalex.org/W2972577568"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"a":[3],"method":[4,102],"to":[5,24,37,55,60],"train":[6],"deep":[7],"speaker":[8,64,75,126],"embed-dings":[9],"end-to-end":[10],"that":[11,82],"jointly":[12],"optimizes":[13],"classification":[14,26,110,122],"and":[15,59,80,105,123,128],"clustering.":[16,39],"A":[17,28],"large":[18,30],"margin":[19,31],"softmax":[20],"loss":[21,34,111],"is":[22,35],"used":[23],"reduce":[25],"errors.":[27],"novel":[29],"Gaussian":[32],"mixture":[33],"proposed":[36],"improve":[38],"With":[40],"the":[41,44,93,98],"joint":[42,99,119],"optimization,":[43,100],"learned":[45],"embeddings":[46],"capture":[47],"segment-level":[48],"acoustic":[49],"representation":[50],"from":[51],"variable-length":[52],"speech":[53],"segments":[54],"discriminate":[56],"between":[57],"speakers":[58],"replicate":[61],"densities":[62],"of":[63,97,118,121],"clusters.":[65],"We":[66],"compare":[67],"performance":[68],"with":[69],"alternative":[70],"methods":[71,86],"on":[72,92],"large-scale":[73],"text-independent":[74],"recognition":[76],"dataset":[77],"VoxCeleb1":[78],"[1]":[79],"observe":[81],"it":[83],"outperforms":[84],"those":[85],"significantly,":[87],"achieving":[88],"new":[89],"state-of-the-art":[90],"results":[91,114],"dataset.":[94],"Moreover,":[95],"because":[96],"this":[101],"exhibits":[103],"faster":[104],"better":[106],"convergence":[107],"than":[108],"using":[109],"alone.":[112],"Our":[113],"suggest":[115],"great":[116],"potential":[117],"optimization":[120],"clustering":[124],"for":[125],"verification":[127],"identification.":[129]},"counts_by_year":[{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
