{"id":"https://openalex.org/W4312319640","doi":"https://doi.org/10.1109/mlsp55214.2022.9943352","title":"Rethinking Audio-Visual Synchronization for Active Speaker Detection","display_name":"Rethinking Audio-Visual Synchronization for Active Speaker Detection","publication_year":2022,"publication_date":"2022-08-22","ids":{"openalex":"https://openalex.org/W4312319640","doi":"https://doi.org/10.1109/mlsp55214.2022.9943352"},"language":"en","primary_location":{"id":"doi:10.1109/mlsp55214.2022.9943352","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp55214.2022.9943352","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE 32nd International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023291127","display_name":"Abudukelimu Wuerkaixi","orcid":"https://orcid.org/0000-0002-5609-606X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Abudukelimu Wuerkaixi","raw_affiliation_strings":["Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University,State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University, Beijing, P.R.China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University,State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University, Beijing, P.R.China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100384568","display_name":"You Zhang","orcid":"https://orcid.org/0000-0002-4649-278X"},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"You Zhang","raw_affiliation_strings":["University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA","Department of Electrical and Computer Engineering, University of Rochester, Rochester, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA","institution_ids":["https://openalex.org/I5388228"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of Rochester, Rochester, NY, USA","institution_ids":["https://openalex.org/I5388228"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102749436","display_name":"Zhiyao Duan","orcid":"https://orcid.org/0000-0002-8334-9974"},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhiyao Duan","raw_affiliation_strings":["University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA","Department of Electrical and Computer Engineering, University of Rochester, Rochester, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Rochester,Department of Electrical and Computer Engineering,Rochester,NY,USA","institution_ids":["https://openalex.org/I5388228"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of Rochester, Rochester, NY, USA","institution_ids":["https://openalex.org/I5388228"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065063835","display_name":"Changshui Zhang","orcid":"https://orcid.org/0000-0002-8088-367X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Changshui Zhang","raw_affiliation_strings":["Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University,State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University, Beijing, P.R.China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University,State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist),Department of Automation,Beijing,P.R.China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Department of Automation, State Key Lab of Intelligent Technologies and Systems, Beijing National Research Center for Information Science and Technology (BNRist), Institute for Artificial Intelligence, Tsinghua University (THUAI), Tsinghua University, Beijing, P.R.China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.3785,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.89121374,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"01","last_page":"06"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7566654682159424},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7096978425979614},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.6965502500534058},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6107974052429199},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5804886221885681},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.47367244958877563},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4580281376838684},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.17409437894821167},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.10212555527687073}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7566654682159424},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7096978425979614},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.6965502500534058},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6107974052429199},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5804886221885681},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.47367244958877563},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4580281376838684},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.17409437894821167},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.10212555527687073},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mlsp55214.2022.9943352","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp55214.2022.9943352","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE 32nd International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7900000214576721}],"awards":[{"id":"https://openalex.org/G5157521015","display_name":"BIGDATA: F: Audio-Visual Scene Understanding","funder_award_id":"1741472","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2001183000","https://openalex.org/W2154636774","https://openalex.org/W2194775991","https://openalex.org/W2604379605","https://openalex.org/W2808631503","https://openalex.org/W2936361916","https://openalex.org/W2962960500","https://openalex.org/W3015841457","https://openalex.org/W3016098309","https://openalex.org/W3035661013","https://openalex.org/W3038871978","https://openalex.org/W3047985687","https://openalex.org/W3169351047","https://openalex.org/W3172472082","https://openalex.org/W3173382920","https://openalex.org/W3181255780","https://openalex.org/W3186700381","https://openalex.org/W3189964604","https://openalex.org/W3194371751","https://openalex.org/W3207207922","https://openalex.org/W3211862173","https://openalex.org/W4287023914","https://openalex.org/W4289665794","https://openalex.org/W4308663722","https://openalex.org/W4385245566","https://openalex.org/W6739901393","https://openalex.org/W6798523648","https://openalex.org/W6803359641","https://openalex.org/W6810980340"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W4245955731","https://openalex.org/W2393726419"],"abstract_inverted_index":{"Active":[0],"speaker":[1],"detection":[2],"(ASD)":[3],"systems":[4],"are":[5,20,147],"important":[6],"modules":[7,109],"for":[8,110],"analyzing":[9],"multi-talker":[10],"conversations.":[11],"They":[12],"aim":[13],"to":[14,114],"detect":[15,127],"which":[16,71],"speakers":[17],"or":[18],"none":[19],"talking":[21],"in":[22,47,79,107],"a":[23,98],"visual":[24,57],"scene":[25],"at":[26,149],"any":[27],"given":[28],"time.":[29],"Existing":[30],"research":[31],"on":[32,37],"ASD":[33,76,112],"does":[34],"not":[35,131],"agree":[36],"the":[38,45,54,81,116,134],"definition":[39,46,63],"of":[40,62,136],"active":[41,90],"speakers.":[42],"We":[43],"clarify":[44],"this":[48,94],"work":[49],"and":[50,56,84,103],"require":[51],"synchronization":[52,83,117],"between":[53],"audio":[55],"speaking":[58,129],"activities.":[59],"This":[60],"clarification":[61],"is":[64],"motivated":[65],"by":[66],"our":[67,123],"extensive":[68],"experiments,":[69],"through":[70],"we":[72,96],"discover":[73],"that":[74,122],"existing":[75],"methods":[77],"fail":[78],"modeling":[80],"audio-visual":[82],"often":[85],"classify":[86],"unsynchronized":[87],"videos":[88],"as":[89,130],"speaking.":[91],"To":[92],"address":[93],"problem,":[95],"propose":[97],"cross-modal":[99],"contrastive":[100],"learning":[101],"strategy":[102],"apply":[104],"positional":[105],"encoding":[106],"attention":[108],"supervised":[111],"models":[113,138],"leverage":[115],"cue.":[118],"Experimental":[119],"results":[120],"suggest":[121],"model":[124],"can":[125],"successfully":[126],"unsyn-chronized":[128],"speaking,":[132],"addressing":[133],"limitation":[135],"current":[137],"<sup":[139,142],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[140,143],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[141,144],"The":[145],"codeanddemo":[146],"available":[148],"https://qithub.com/urkax/SyncTalkNet..":[150]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1}],"updated_date":"2026-06-22T08:00:12.763002","created_date":"2025-10-10T00:00:00"}
