{"id":"https://openalex.org/W4392904477","doi":"https://doi.org/10.1109/icassp48485.2024.10447899","title":"Large Scale Self-Supervised Pretraining for Active Speaker Detection","display_name":"Large Scale Self-Supervised Pretraining for Active Speaker Detection","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392904477","doi":"https://doi.org/10.1109/icassp48485.2024.10447899"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447899","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447899","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048852285","display_name":"Otavio Braga","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Otavio Braga","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101861591","display_name":"Wei Xia","orcid":"https://orcid.org/0009-0009-4734-6256"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wei Xia","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103006745","display_name":"Keith Johnson","orcid":"https://orcid.org/0000-0002-2569-8121"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Keith Johnson","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084283951","display_name":"Alice Z. Chuang","orcid":"https://orcid.org/0000-0002-9373-1624"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alice Chuang","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007081982","display_name":"Yunfan Ye","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yunfan Ye","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005881531","display_name":"Olivier Siohan","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Olivier Siohan","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024946787","display_name":"Tu\u1ea5n Anh Nguy\u1ec5n","orcid":"https://orcid.org/0000-0002-9123-3584"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tuan Anh Nguyen","raw_affiliation_strings":["Google, Inc"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5048852285"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":0.375,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.49544919,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"10036","last_page":"10040"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.8430050611495972},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.799757719039917},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5957759618759155},{"id":"https://openalex.org/keywords/scratch","display_name":"Scratch","score":0.5742982625961304},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5108106136322021},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5074617266654968},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.49034905433654785},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4712009131908417},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.41436392068862915},{"id":"https://openalex.org/keywords/supervised-learning","display_name":"Supervised learning","score":0.41423964500427246},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.06681731343269348}],"concepts":[{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.8430050611495972},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.799757719039917},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5957759618759155},{"id":"https://openalex.org/C2781235140","wikidata":"https://www.wikidata.org/wiki/Q275131","display_name":"Scratch","level":2,"score":0.5742982625961304},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5108106136322021},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5074617266654968},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.49034905433654785},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4712009131908417},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.41436392068862915},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.41423964500427246},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.06681731343269348},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447899","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447899","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2033256038","https://openalex.org/W2112796928","https://openalex.org/W2146502635","https://openalex.org/W2531409750","https://openalex.org/W2963122170","https://openalex.org/W2963155035","https://openalex.org/W2972756321","https://openalex.org/W3006974783","https://openalex.org/W3016098309","https://openalex.org/W3016176723","https://openalex.org/W3160681319","https://openalex.org/W4224923231","https://openalex.org/W4250482878","https://openalex.org/W4289665794","https://openalex.org/W4293363567","https://openalex.org/W6681435938"],"related_works":["https://openalex.org/W2475116013","https://openalex.org/W2770018148","https://openalex.org/W2358308169","https://openalex.org/W2385135707","https://openalex.org/W2140315382","https://openalex.org/W2059109728","https://openalex.org/W322691623","https://openalex.org/W2494989134","https://openalex.org/W2509444723","https://openalex.org/W2004958254"],"abstract_inverted_index":{"In":[0],"this":[1],"work":[2],"we":[3,44,49],"investigate":[4],"the":[5,69],"impact":[6],"of":[7,23,27],"a":[8,33,53,77],"large-scale":[9],"self-supervised":[10],"pretraining":[11,48],"strategy":[12],"for":[13,64],"active":[14],"speaker":[15],"detection":[16],"(ASD)":[17],"on":[18,38,76],"an":[19],"unlabeled":[20],"dataset":[21,79],"consisting":[22],"over":[24],"125k":[25],"hours":[26],"YouTube":[28],"videos.":[29],"When":[30],"compared":[31],"to":[32,59],"baseline":[34],"trained":[35],"from":[36],"scratch":[37],"much":[39],"smaller":[40],"in-domain":[41],"labeled":[42],"datasets":[43],"show":[45],"that":[46],"with":[47,81],"not":[50],"only":[51],"have":[52],"more":[54],"stable":[55],"supervised":[56],"training":[57],"due":[58],"better":[60],"audio-visual":[61],"features":[62],"used":[63],"initialization,":[65],"but":[66],"also":[67],"improve":[68],"ASD":[70],"mean":[71],"average":[72],"precision":[73],"by":[74],"23%":[75],"challenging":[78],"collected":[80],"Google":[82],"Nest":[83],"Hub":[84],"Max":[85],"devices":[86],"capturing":[87],"real":[88],"user":[89],"interactions.":[90]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
