{"id":"https://openalex.org/W3186169793","doi":"https://doi.org/10.1109/ijcb52358.2021.9484331","title":"Learning Discriminative Speaker Embedding by Improving Aggregation Strategy and Loss Function for Speaker Verification","display_name":"Learning Discriminative Speaker Embedding by Improving Aggregation Strategy and Loss Function for Speaker Verification","publication_year":2021,"publication_date":"2021-07-20","ids":{"openalex":"https://openalex.org/W3186169793","doi":"https://doi.org/10.1109/ijcb52358.2021.9484331","mag":"3186169793"},"language":"en","primary_location":{"id":"doi:10.1109/ijcb52358.2021.9484331","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcb52358.2021.9484331","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Joint Conference on Biometrics (IJCB)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015237477","display_name":"Chengfang Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chengfang Luo","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051687192","display_name":"Xin Guo","orcid":"https://orcid.org/0000-0001-7727-0568"},"institutions":[{"id":"https://openalex.org/I4210122543","display_name":"Guangdong Polytechnic Normal University","ror":"https://ror.org/02pcb5m77","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210122543"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Guo","raw_affiliation_strings":["Guangdong Communication Polytechnic, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"Guangdong Communication Polytechnic, Guangzhou, China","institution_ids":["https://openalex.org/I4210122543"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032448599","display_name":"Aiwen Deng","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Aiwen Deng","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085260822","display_name":"Wei Xu","orcid":"https://orcid.org/0009-0001-9354-0297"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Xu","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048022062","display_name":"Junhong Zhao","orcid":"https://orcid.org/0000-0001-7031-3828"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junhong Zhao","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089840086","display_name":"Wenxiong Kang","orcid":"https://orcid.org/0000-0001-9023-7252"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxiong Kang","raw_affiliation_strings":["South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5015237477"],"corresponding_institution_ids":["https://openalex.org/I90610280"],"apc_list":null,"apc_paid":null,"fwci":1.3597,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.8449971,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.9590467214584351},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.8672689199447632},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7928048372268677},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.7357519865036011},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5829548239707947},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5654206275939941},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.557224452495575},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.508300244808197},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4985644817352295},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4930119216442108},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.48561277985572815},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.4705256223678589},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.44479039311408997},{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.41626590490341187}],"concepts":[{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.9590467214584351},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.8672689199447632},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7928048372268677},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.7357519865036011},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5829548239707947},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5654206275939941},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.557224452495575},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.508300244808197},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4985644817352295},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4930119216442108},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.48561277985572815},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.4705256223678589},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.44479039311408997},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.41626590490341187},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcb52358.2021.9484331","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcb52358.2021.9484331","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Joint Conference on Biometrics (IJCB)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6800000071525574,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335795","display_name":"Science and Technology Planning Project of Guangdong Province","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W2601450892","https://openalex.org/W2620629206","https://openalex.org/W2726515241","https://openalex.org/W2748488820","https://openalex.org/W2752782242","https://openalex.org/W2794506738","https://openalex.org/W2808631503","https://openalex.org/W2888968865","https://openalex.org/W2900560726","https://openalex.org/W2916104401","https://openalex.org/W2916301830","https://openalex.org/W2939763029","https://openalex.org/W2951019013","https://openalex.org/W2963314614","https://openalex.org/W2963371159","https://openalex.org/W2963386851","https://openalex.org/W2963420686","https://openalex.org/W2964154960","https://openalex.org/W2964216323","https://openalex.org/W2969985801","https://openalex.org/W2972441390","https://openalex.org/W2972627751","https://openalex.org/W2972885011","https://openalex.org/W2972986505","https://openalex.org/W3013020904","https://openalex.org/W3015574842","https://openalex.org/W3020953549","https://openalex.org/W3044308976","https://openalex.org/W3096235116","https://openalex.org/W3097000690","https://openalex.org/W3147324749","https://openalex.org/W3149926570","https://openalex.org/W4394665180","https://openalex.org/W6735236233","https://openalex.org/W6755942961","https://openalex.org/W6761233929","https://openalex.org/W6781860096","https://openalex.org/W6786547643","https://openalex.org/W6864750640"],"related_works":["https://openalex.org/W3095152779","https://openalex.org/W3119773509","https://openalex.org/W3128220219","https://openalex.org/W4384929466","https://openalex.org/W2206035908","https://openalex.org/W3148366653","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162"],"abstract_inverted_index":{"The":[0,167],"embedding-based":[1],"speaker":[2,27,54,99],"verification":[3],"(SV)":[4],"technology":[5],"has":[6],"witnessed":[7],"significant":[8,155],"progress":[9],"due":[10],"to":[11,22,62,120],"the":[12,24,30,37,43,50,68,94,102,122,141,145,160],"advances":[13],"of":[14,26,39,53,75,96,109,126,170],"deep":[15],"convolutional":[16],"neural":[17],"networks":[18],"(DCNN).":[19],"However,":[20],"how":[21],"improve":[23,49],"discrimination":[25],"embedding":[28,55],"in":[29,42,105],"open":[31],"world":[32],"SV":[33],"task":[34],"is":[35,60,91,118,173],"still":[36],"focus":[38],"current":[40],"research":[41],"community.":[44],"In":[45],"this":[46,171],"paper,":[47],"we":[48],"discriminative":[51],"power":[52],"from":[56,101],"three-fold:":[57],"(1)":[58],"NeXtVLAD":[59,90],"introduced":[61],"aggregate":[63],"frame-level":[64,70,103],"features,":[65],"which":[66,124],"decomposes":[67],"high-dimensional":[69],"features":[71],"into":[72],"a":[73,127,131],"group":[74],"low-dimensional":[76],"vectors":[77],"before":[78],"applying":[79],"VLAD":[80],"aggregation.":[81],"(2)":[82],"A":[83,112],"multi-scale":[84],"aggregation":[85],"strategy":[86],"(MSA)":[87],"assembled":[88],"with":[89,93,159],"designed":[92],"purpose":[95],"fully":[97],"extract":[98],"information":[100],"feature":[104],"different":[106],"hidden":[107],"layers":[108],"DCNN.":[110],"(3)":[111],"mutually":[113],"complementary":[114],"assembling":[115],"loss":[116,129],"function":[117],"proposed":[119,151],"train":[121],"model,":[123],"consists":[125],"prototypical":[128],"and":[130,144,162],"marginal-based":[132],"softmax":[133],"loss.":[134],"Extensive":[135],"experiments":[136],"have":[137],"been":[138],"conducted":[139],"on":[140],"VoxCeleb-1":[142],"dataset,":[143],"experimental":[146],"results":[147],"show":[148],"that":[149],"our":[150],"system":[152],"can":[153],"obtain":[154],"performance":[156],"improvements":[157],"compared":[158],"baseline,":[161],"obtains":[163],"new":[164],"state-of-the-art":[165],"results.":[166],"source":[168],"code":[169],"paper":[172],"available":[174],"at":[175],"https://github.com/LCF2764/Discriminative-Speaker-Embedding.":[176]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":7},{"year":2022,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
