{"id":"https://openalex.org/W4313048484","doi":"https://doi.org/10.1109/mmsp55362.2022.9948860","title":"DyViSE: Dynamic Vision-Guided Speaker Embedding for Audio-Visual Speaker Diarization","display_name":"DyViSE: Dynamic Vision-Guided Speaker Embedding for Audio-Visual Speaker Diarization","publication_year":2022,"publication_date":"2022-09-26","ids":{"openalex":"https://openalex.org/W4313048484","doi":"https://doi.org/10.1109/mmsp55362.2022.9948860"},"language":"en","primary_location":{"id":"doi:10.1109/mmsp55362.2022.9948860","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp55362.2022.9948860","pdf_url":null,"source":{"id":"https://openalex.org/S4363605768","display_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023291127","display_name":"Abudukelimu Wuerkaixi","orcid":"https://orcid.org/0000-0002-5609-606X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Abudukelimu Wuerkaixi","raw_affiliation_strings":["Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China"],"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","institution_ids":["https://openalex.org/I4210094879"]},{"raw_affiliation_string":"Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080651684","display_name":"Kunda Yan","orcid":"https://orcid.org/0009-0005-9248-1159"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kunda Yan","raw_affiliation_strings":["Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China"],"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","institution_ids":["https://openalex.org/I4210094879"]},{"raw_affiliation_string":"Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100384568","display_name":"You Zhang","orcid":"https://orcid.org/0000-0002-4649-278X"},"institutions":[{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"You Zhang","raw_affiliation_strings":["Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China"],"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","institution_ids":["https://openalex.org/I4210094879"]},{"raw_affiliation_string":"Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102749436","display_name":"Zhiyao Duan","orcid":"https://orcid.org/0000-0002-8334-9974"},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhiyao Duan","raw_affiliation_strings":["University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA","Department of Electrical and Computer Engineering, University of Rochester, Rochester, NY, USA"],"affiliations":[{"raw_affiliation_string":"University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA","institution_ids":["https://openalex.org/I5388228"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of Rochester, Rochester, NY, USA","institution_ids":["https://openalex.org/I5388228"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065063835","display_name":"Changshui Zhang","orcid":"https://orcid.org/0000-0002-8088-367X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Changshui Zhang","raw_affiliation_strings":["Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China"],"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Tsinghua University (THUAI),State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","institution_ids":["https://openalex.org/I4210094879"]},{"raw_affiliation_string":"Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Beijing, P.R.China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5023291127"],"corresponding_institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.8591,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.75197472,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.8969849348068237},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8456882238388062},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.704922080039978},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.70354825258255},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6811522841453552},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5799564719200134},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5175649523735046},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5084025263786316},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.43195363879203796},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.41818171739578247},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.21554097533226013},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.11453402042388916}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.8969849348068237},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8456882238388062},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.704922080039978},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.70354825258255},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6811522841453552},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5799564719200134},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5175649523735046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5084025263786316},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.43195363879203796},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.41818171739578247},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.21554097533226013},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.11453402042388916}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mmsp55362.2022.9948860","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp55362.2022.9948860","pdf_url":null,"source":{"id":"https://openalex.org/S4363605768","display_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.7300000190734863}],"awards":[{"id":"https://openalex.org/G4623234239","display_name":null,"funder_award_id":"1741472,DGE-1922591","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1595802121","https://openalex.org/W1800068500","https://openalex.org/W1971901154","https://openalex.org/W2049893888","https://openalex.org/W2159094433","https://openalex.org/W2236062296","https://openalex.org/W2316138215","https://openalex.org/W2808631503","https://openalex.org/W2886232760","https://openalex.org/W2889418727","https://openalex.org/W2896538040","https://openalex.org/W2949662773","https://openalex.org/W2959214850","https://openalex.org/W2963307811","https://openalex.org/W2969985801","https://openalex.org/W2972729214","https://openalex.org/W2972949456","https://openalex.org/W3008357631","https://openalex.org/W3015445830","https://openalex.org/W3015841457","https://openalex.org/W3038871978","https://openalex.org/W3094831814","https://openalex.org/W3103322708","https://openalex.org/W3126757411","https://openalex.org/W3182657421","https://openalex.org/W3197331597","https://openalex.org/W3206008172","https://openalex.org/W3209984917","https://openalex.org/W3212886388","https://openalex.org/W4213448838","https://openalex.org/W4224925049","https://openalex.org/W4226419173","https://openalex.org/W4286378963","https://openalex.org/W4312319640","https://openalex.org/W6638440562","https://openalex.org/W6743555153","https://openalex.org/W6785851975","https://openalex.org/W6803164887","https://openalex.org/W6804225560","https://openalex.org/W6810459795","https://openalex.org/W6810587411"],"related_works":["https://openalex.org/W4292055372","https://openalex.org/W1992796048","https://openalex.org/W1828117201","https://openalex.org/W2103897043","https://openalex.org/W2160277484","https://openalex.org/W3148366653","https://openalex.org/W92500784","https://openalex.org/W1492025301","https://openalex.org/W2982817420","https://openalex.org/W4288029399"],"abstract_inverted_index":{"Speaker":[0],"diarization":[1,13],"aims":[2],"to":[3,19,47,93,107,118],"determine":[4],"\u201cwho":[5],"spoke":[6],"when\u201d":[7],"in":[8,17,97,110],"multi-speaker":[9],"scenarios.":[10],"Audio-visual":[11],"speaker":[12,31,83,95],"leverages":[14],"visual":[15,72,91],"information":[16,92,106],"addition":[18],"audio":[20,38,63,109],"signals":[21],"and":[22,39,42,114,148],"has":[23],"shown":[24],"improved":[25],"performance.":[26],"Existing":[27],"audio-visual":[28],"methods":[29],"extract":[30,94],"embeddings":[32,96],"for":[33,57,68,89,123],"each":[34,124],"video":[35],"clip":[36],"using":[37],"facial":[40,116],"features,":[41],"then":[43],"perform":[44],"clustering":[45,133],"according":[46],"their":[48],"similarity.":[49],"However,":[50],"this":[51,77],"approach":[52],"would":[53],"not":[54],"work":[55],"well":[56],"noisy":[58],"or":[59],"overlapped":[60],"speech":[61],"where":[62,71],"features":[64,73,117],"are":[65,74],"corrupted,":[66],"nor":[67],"off-screen":[69],"speakers":[70],"missing.":[75],"In":[76],"work,":[78],"we":[79],"propose":[80],"dynamic":[81,103],"vision-guided":[82],"embedding":[84,122],"(DyViSE),":[85],"a":[86,98,111,131],"novel":[87],"method":[88],"leveraging":[90],"multi-stage":[99],"system.":[100],"DyViSE":[101,127,140],"uses":[102],"lip":[104],"movement":[105],"denoise":[108],"latent":[112],"space":[113],"integrates":[115],"obtain":[119],"an":[120,137],"identity-discriminative":[121],"speaking":[125],"segment.":[126],"is":[128,154],"trained":[129],"with":[130,136],"deep":[132],"loss":[134],"along":[135],"exemplary":[138],"loss.":[139],"demonstrates":[141],"remarkable":[142],"performance":[143],"on":[144],"both":[145],"real-world":[146],"videos":[147],"artificially":[149],"assembled":[150],"videos.":[151],"Our":[152],"code":[153],"available":[155],"at":[156],"https://github.com/urkax/DyViSE.":[157]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
