{"id":"https://openalex.org/W3149926570","doi":"https://doi.org/10.1109/slt48900.2021.9383579","title":"Supervised Attention for Speaker Recognition","display_name":"Supervised Attention for Speaker Recognition","publication_year":2021,"publication_date":"2021-01-19","ids":{"openalex":"https://openalex.org/W3149926570","doi":"https://doi.org/10.1109/slt48900.2021.9383579","mag":"3149926570"},"language":"en","primary_location":{"id":"doi:10.1109/slt48900.2021.9383579","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383579","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043877199","display_name":"Seong Min Kye","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Seong Min Kye","raw_affiliation_strings":["Korea Advanced Institute of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Korea Advanced Institute of Science and Technology","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038723822","display_name":"Joon Son Chung","orcid":"https://orcid.org/0000-0001-7741-7275"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Joon Son Chung","raw_affiliation_strings":["Naver Corporation"],"affiliations":[{"raw_affiliation_string":"Naver Corporation","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066573438","display_name":"Hoirin Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hoirin Kim","raw_affiliation_strings":["Korea Advanced Institute of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Korea Advanced Institute of Science and Technology","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5043877199"],"corresponding_institution_ids":["https://openalex.org/I157485424"],"apc_list":null,"apc_paid":null,"fwci":1.2238,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.82829424,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":"14","issue":null,"first_page":"286","last_page":"293"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8181990385055542},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7793208360671997},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.743963897228241},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6934076547622681},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6532582640647888},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6498117446899414},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.6022224426269531},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5263258814811707},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.5092283487319946},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4502035975456238},{"id":"https://openalex.org/keywords/feature-vector","display_name":"Feature vector","score":0.44849279522895813},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4078342616558075}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8181990385055542},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7793208360671997},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.743963897228241},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6934076547622681},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6532582640647888},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6498117446899414},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.6022224426269531},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5263258814811707},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.5092283487319946},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4502035975456238},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.44849279522895813},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4078342616558075},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt48900.2021.9383579","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383579","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7400000095367432,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W2101261946","https://openalex.org/W2121812409","https://openalex.org/W2123768812","https://openalex.org/W2150769028","https://openalex.org/W2395750323","https://openalex.org/W2406312423","https://openalex.org/W2601450892","https://openalex.org/W2726515241","https://openalex.org/W2748488820","https://openalex.org/W2784163702","https://openalex.org/W2794506738","https://openalex.org/W2808631503","https://openalex.org/W2890964092","https://openalex.org/W2916104401","https://openalex.org/W2940070181","https://openalex.org/W2951758756","https://openalex.org/W2963371159","https://openalex.org/W2963386851","https://openalex.org/W2963466847","https://openalex.org/W2969985801","https://openalex.org/W2972627751","https://openalex.org/W2972712416","https://openalex.org/W2972909277","https://openalex.org/W3013020904","https://openalex.org/W3014217528","https://openalex.org/W3020953549","https://openalex.org/W3025075133","https://openalex.org/W3025515949","https://openalex.org/W3044308976","https://openalex.org/W3048825472","https://openalex.org/W3096235116","https://openalex.org/W3097206043","https://openalex.org/W3097256596","https://openalex.org/W3103152812","https://openalex.org/W3147324749","https://openalex.org/W6713727690","https://openalex.org/W6735236233","https://openalex.org/W6781368565"],"related_works":["https://openalex.org/W2953234277","https://openalex.org/W2626256601","https://openalex.org/W147410782","https://openalex.org/W2900413183","https://openalex.org/W2529301793","https://openalex.org/W4390975304","https://openalex.org/W3148366653","https://openalex.org/W156213964","https://openalex.org/W2050960118","https://openalex.org/W129770839"],"abstract_inverted_index":{"The":[0],"recently":[1],"proposed":[2,98],"self-attentive":[3],"pooling":[4,54],"(SAP)":[5],"has":[6],"shown":[7],"good":[8],"performance":[9,131],"in":[10,57,69,84,119],"several":[11],"speaker":[12,43,126],"recognition":[13],"systems.":[14],"In":[15],"SAP":[16,47],"systems,":[17],"the":[18,26,30,38,46,51,63,81,90,107,133,137],"context":[19,33,91,100],"vector":[20,34,92,101],"is":[21,35,65],"trained":[22],"end-to-end":[23,70],"together":[24],"with":[25],"feature":[27],"extractor,":[28],"where":[29],"role":[31],"of":[32],"to":[36,50,105],"select":[37,106],"most":[39,108],"discriminative":[40],"frames":[41],"for":[42,79],"recognition.":[44],"However,":[45],"underperforms":[48],"compared":[49],"temporal":[52],"average":[53],"(TAP)":[55],"baseline":[56],"some":[58],"settings,":[59],"which":[60,88],"implies":[61],"that":[62,113],"attention":[64,82],"not":[66],"learnt":[67],"effectively":[68],"training.":[71],"To":[72],"tackle":[73],"this":[74],"problem,":[75],"we":[76],"introduce":[77],"strategies":[78],"training":[80],"mechanism":[83],"a":[85],"supervised":[86],"manner,":[87],"learns":[89],"using":[93],"classified":[94],"samples.":[95],"With":[96],"our":[97,114],"methods,":[99],"can":[102],"be":[103],"boosted":[104],"informative":[109],"frames.":[110],"We":[111],"show":[112],"method":[115],"outperforms":[116],"existing":[117,134],"methods":[118],"various":[120],"experimental":[121],"settings":[122],"including":[123],"short":[124],"utterance":[125],"recognition,":[127],"and":[128],"achieves":[129],"competitive":[130],"over":[132],"baselines":[135],"on":[136],"VoxCeleb":[138],"datasets.":[139]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
