{"id":"https://openalex.org/W4401212605","doi":"https://doi.org/10.1145/3653876.3653878","title":"Breaking Barriers with Enhanced DINO Framework and Score Normalization to Self-Supervised Speaker Verification","display_name":"Breaking Barriers with Enhanced DINO Framework and Score Normalization to Self-Supervised Speaker Verification","publication_year":2024,"publication_date":"2024-02-23","ids":{"openalex":"https://openalex.org/W4401212605","doi":"https://doi.org/10.1145/3653876.3653878"},"language":"en","primary_location":{"id":"doi:10.1145/3653876.3653878","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3653876.3653878","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 8th International Conference on Digital Signal Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010423458","display_name":"Xianmei Wan","orcid":"https://orcid.org/0009-0006-9266-5000"},"institutions":[{"id":"https://openalex.org/I105126617","display_name":"Zhejiang International Studies University","ror":"https://ror.org/01vwvvq12","country_code":"CN","type":"education","lineage":["https://openalex.org/I105126617"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xianmei Wan","raw_affiliation_strings":["School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China"],"raw_orcid":"https://orcid.org/0009-0006-9266-5000","affiliations":[{"raw_affiliation_string":"School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China","institution_ids":["https://openalex.org/I105126617"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101886203","display_name":"Xiaosi Zhan","orcid":"https://orcid.org/0000-0002-4640-628X"},"institutions":[{"id":"https://openalex.org/I105126617","display_name":"Zhejiang International Studies University","ror":"https://ror.org/01vwvvq12","country_code":"CN","type":"education","lineage":["https://openalex.org/I105126617"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaosi Zhan","raw_affiliation_strings":["School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China"],"raw_orcid":"https://orcid.org/0000-0002-4640-628X","affiliations":[{"raw_affiliation_string":"School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China","institution_ids":["https://openalex.org/I105126617"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101259000","display_name":"Na Li","orcid":"https://orcid.org/0009-0009-3824-2973"},"institutions":[{"id":"https://openalex.org/I105126617","display_name":"Zhejiang International Studies University","ror":"https://ror.org/01vwvvq12","country_code":"CN","type":"education","lineage":["https://openalex.org/I105126617"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Na Li","raw_affiliation_strings":["School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China"],"raw_orcid":"https://orcid.org/0009-0009-3824-2973","affiliations":[{"raw_affiliation_string":"School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China","institution_ids":["https://openalex.org/I105126617"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054618632","display_name":"Guihua Liao","orcid":"https://orcid.org/0009-0000-5366-1804"},"institutions":[{"id":"https://openalex.org/I105126617","display_name":"Zhejiang International Studies University","ror":"https://ror.org/01vwvvq12","country_code":"CN","type":"education","lineage":["https://openalex.org/I105126617"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guihua Liao","raw_affiliation_strings":["School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China"],"raw_orcid":"https://orcid.org/0009-0000-5366-1804","affiliations":[{"raw_affiliation_string":"School of International Business, School of Innovation and Entrepreneurship, Zhejiang International Studies University, China","institution_ids":["https://openalex.org/I105126617"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5010423458"],"corresponding_institution_ids":["https://openalex.org/I105126617"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.09950888,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"158","last_page":"164"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.8708630800247192},{"id":"https://openalex.org/keywords/speaker-verification","display_name":"Speaker verification","score":0.7816963195800781},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7114919424057007},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43901723623275757},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.42370277643203735},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4137110114097595},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.33154869079589844},{"id":"https://openalex.org/keywords/sociology","display_name":"Sociology","score":0.0866234302520752}],"concepts":[{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.8708630800247192},{"id":"https://openalex.org/C2982762665","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker verification","level":3,"score":0.7816963195800781},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7114919424057007},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43901723623275757},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42370277643203735},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4137110114097595},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.33154869079589844},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0866234302520752},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3653876.3653878","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3653876.3653878","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 8th International Conference on Digital Signal Processing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W5064682","https://openalex.org/W1946987309","https://openalex.org/W2044812127","https://openalex.org/W2078953162","https://openalex.org/W2101556109","https://openalex.org/W2194775991","https://openalex.org/W2696967604","https://openalex.org/W2726515241","https://openalex.org/W2747165665","https://openalex.org/W2794506738","https://openalex.org/W2802973008","https://openalex.org/W2888908158","https://openalex.org/W2890964092","https://openalex.org/W2916104401","https://openalex.org/W2963466847","https://openalex.org/W3005680577","https://openalex.org/W3013020904","https://openalex.org/W3024869864","https://openalex.org/W3035060554","https://openalex.org/W3035524453","https://openalex.org/W3147482759","https://openalex.org/W3159481202","https://openalex.org/W3161606033","https://openalex.org/W3166898278","https://openalex.org/W3205635414","https://openalex.org/W4200633018","https://openalex.org/W4221159588","https://openalex.org/W4292976050","https://openalex.org/W4299500092","https://openalex.org/W4313590886","https://openalex.org/W4365460940","https://openalex.org/W4372346385","https://openalex.org/W4385493054"],"related_works":["https://openalex.org/W1968216131","https://openalex.org/W2355011896","https://openalex.org/W3089964815","https://openalex.org/W1581134722","https://openalex.org/W66821593","https://openalex.org/W4297807400","https://openalex.org/W1491159402","https://openalex.org/W4313854686","https://openalex.org/W1516392727","https://openalex.org/W2140022733"],"abstract_inverted_index":{"Training":[0],"robust":[1,58],"speaker":[2,6,63,90,120],"verification":[3,64,121],"systems":[4],"without":[5],"labels":[7],"has":[8,17],"long":[9],"posed":[10],"a":[11,32],"significant":[12],"challenge.":[13],"Self-supervised":[14],"learning":[15,52],"(SSL)":[16],"garnered":[18],"increased":[19],"attention":[20],"in":[21,88,110,118],"the":[22,55,73,77,98,102],"field":[23],"of":[24,57,76,94,104],"speech":[25],"processing.":[26],"However,":[27],"prior":[28],"researches":[29],"have":[30],"revealed":[31],"notable":[33],"performance":[34,87],"gap":[35],"between":[36],"self-supervised":[37,51,74,89,106,119],"and":[38],"fully":[39],"supervised":[40],"methods.":[41],"This":[42],"paper":[43],"introduces":[44],"an":[45],"adaptive":[46,68],"score":[47,69,107],"normalization":[48,70,108],"approach":[49],"for":[50,61],"frameworks,":[53],"enabling":[54],"establishment":[56],"decision":[59],"thresholds":[60],"practical":[62],"systems.":[65],"By":[66],"incorporating":[67],"techniques":[71],"into":[72],"framework":[75],"enhanced":[78,111],"DINO":[79,112],"(self-DIstillation":[80],"with":[81],"NO":[82],"labels),":[83],"it":[84],"achieves":[85],"superior":[86],"verification.":[91],"A":[92],"series":[93],"experiments":[95],"conducted":[96],"on":[97,122],"VoxCeleb":[99],"datasets":[100],"demonstrates":[101],"efficacy":[103],"our":[105],"algorithm":[109],"framework,":[113],"leading":[114],"to":[115],"state-of-the-art":[116],"results":[117],"VoxCeleb.":[123]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
