{"id":"https://openalex.org/W4415433903","doi":"https://doi.org/10.21437/interspeech.2025-628","title":"Rethinking Leveraging Pre-Trained Multi-Layer Representations for Speaker Verification","display_name":"Rethinking Leveraging Pre-Trained Multi-Layer Representations for Speaker Verification","publication_year":2025,"publication_date":"2025-08-17","ids":{"openalex":"https://openalex.org/W4415433903","doi":"https://doi.org/10.21437/interspeech.2025-628"},"language":null,"primary_location":{"id":"doi:10.21437/interspeech.2025-628","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2025-628","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2512.22148","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046518803","display_name":"Jin Sob Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jin Sob Kim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103110752","display_name":"Hyun Joon Park","orcid":"https://orcid.org/0000-0002-5308-6675"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hyun Joon Park","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001866594","display_name":"Woo-Seok Shin","orcid":"https://orcid.org/0000-0002-8475-4795"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wooseok Shin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5024995454","display_name":"Sung Won Han","orcid":"https://orcid.org/0000-0002-0040-3542"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sung Won Han","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5046518803"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":6.5456,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.9659076,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3713","last_page":"3717"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9757999777793884,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9401999711990356,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.7678999900817871},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.6111000180244446},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5253000259399414},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.48159998655319214},{"id":"https://openalex.org/keywords/speaker-verification","display_name":"Speaker verification","score":0.4814000129699707},{"id":"https://openalex.org/keywords/layer","display_name":"Layer (electronics)","score":0.45509999990463257}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7796000242233276},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.7678999900817871},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.6111000180244446},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5616999864578247},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5253000259399414},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.48159998655319214},{"id":"https://openalex.org/C2982762665","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker verification","level":3,"score":0.4814000129699707},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4717999994754791},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.45509999990463257},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.44609999656677246},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.42899999022483826},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.35370001196861267},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3402000069618225},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.267300009727478},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2624000012874603}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2025-628","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2025-628","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2025","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2512.22148","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.22148","pdf_url":"https://arxiv.org/pdf/2512.22148","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2512.22148","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.22148","pdf_url":"https://arxiv.org/pdf/2512.22148","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"speaker":[1,50,76,88,125],"verification":[2],"studies":[3],"have":[4,18],"achieved":[5],"notable":[6],"success":[7],"by":[8],"leveraging":[9],"layer-wise":[10],"output":[11],"from":[12,45,59,90],"pre-trained":[13,46,91],"Transformer":[14],"models.":[15],"However,":[16],"few":[17],"explored":[19],"the":[20,28,54,96,110],"advancements":[21],"in":[22],"aggregating":[23,42],"these":[24],"multi-level":[25],"features":[26],"beyond":[27],"static":[29],"weighted":[30],"average.":[31],"We":[32,113],"present":[33],"Layer":[34],"Attentive":[35,81],"Pooling":[36,84],"(LAP),":[37],"a":[38,73],"novel":[39],"strategy":[40],"for":[41,49,123],"inter-layer":[43],"representations":[44],"speech":[47],"models":[48],"verification.":[51],"LAP":[52,79,116],"assesses":[53],"significance":[55],"of":[56,68],"each":[57],"layer":[58],"multiple":[60],"perspectives":[61],"time-dynamically,":[62],"and":[63,80,118],"employs":[64],"max":[65],"pooling":[66],"instead":[67],"averaging.":[69],"Additionally,":[70],"we":[71],"propose":[72],"lightweight":[74],"backend":[75],"model":[77,92],"comprising":[78],"Statistical":[82],"Temporal":[83],"(ASTP)":[85],"to":[86],"extract":[87],"embeddings":[89],"output.":[93],"Experiments":[94],"on":[95],"VoxCeleb":[97],"benchmark":[98],"reveal":[99],"that":[100],"our":[101],"compact":[102],"architecture":[103],"achieves":[104],"state-of-the-art":[105],"performance":[106],"while":[107],"greatly":[108],"reducing":[109],"training":[111],"time.":[112],"further":[114],"analyzed":[115],"design":[117],"its":[119],"dynamic":[120],"weighting":[121],"mechanism":[122],"capturing":[124],"characteristics.":[126]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-24T00:00:00"}
