{"id":"https://openalex.org/W3189964604","doi":"https://doi.org/10.1145/3474085.3475275","title":"UniCon: Unified Context Network for Robust Active Speaker Detection","display_name":"UniCon: Unified Context Network for Robust Active Speaker Detection","publication_year":2021,"publication_date":"2021-10-17","ids":{"openalex":"https://openalex.org/W3189964604","doi":"https://doi.org/10.1145/3474085.3475275","mag":"3189964604"},"language":"en","primary_location":{"id":"doi:10.1145/3474085.3475275","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474085.3475275","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2108.02607","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yuanhang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuanhang Zhang","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Susan Liang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Susan Liang","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Shuang Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuang Yang","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao Liu","raw_affiliation_strings":["Tomorrow Advancing Life, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tomorrow Advancing Life, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhongqin Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhongqin Wu","raw_affiliation_strings":["Tomorrow Advancing Life, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tomorrow Advancing Life, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Shiguang Shan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiguang Shan","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"last","author":{"id":null,"display_name":"Xilin Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xilin Chen","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":4.3221,"has_fulltext":false,"cited_by_count":38,"citation_normalized_percentile":{"value":0.95304756,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"3964","last_page":"3972"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.7222999930381775},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6711000204086304},{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.5152999758720398},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5080000162124634},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4287000000476837},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.4262000024318695},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.39500001072883606},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3831000030040741}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7809000015258789},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.7222999930381775},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6711000204086304},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5796999931335449},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.5152999758720398},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5080000162124634},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4332999885082245},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4287000000476837},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.4262000024318695},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.39500001072883606},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.3625999987125397},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.3578000068664551},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.3515999913215637},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3262999951839447},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.32190001010894775},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3052999973297119},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.25279998779296875}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3474085.3475275","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474085.3475275","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2108.02607","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2108.02607","pdf_url":"https://arxiv.org/pdf/2108.02607","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2108.02607","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2108.02607","pdf_url":"https://arxiv.org/pdf/2108.02607","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W58692019","https://openalex.org/W1562651896","https://openalex.org/W1677182931","https://openalex.org/W2048447221","https://openalex.org/W2054852564","https://openalex.org/W2076029968","https://openalex.org/W2296073425","https://openalex.org/W2316138215","https://openalex.org/W2525932165","https://openalex.org/W2808631503","https://openalex.org/W2991853946","https://openalex.org/W3015222335","https://openalex.org/W3015598461","https://openalex.org/W3035875334","https://openalex.org/W3038871978","https://openalex.org/W3041847644","https://openalex.org/W3119269912","https://openalex.org/W3157160370","https://openalex.org/W4289665794"],"related_works":[],"abstract_inverted_index":{"We":[0],"propose":[1],"a":[2,57,122,152],"new":[3],"efficient":[4],"framework,":[5],"the":[6,35,38,75,87,91,149,173,188,195,203],"Unified":[7],"Context":[8],"Network":[9],"(UniCon),":[10],"for":[11,19,125,194],"robust":[12,126],"active":[13],"speaker":[14],"detection":[15],"(ASD).":[16],"Traditional":[17],"methods":[18],"ASD":[20,139],"usually":[21],"operate":[22],"on":[23,63,113,136,163,187,198],"each":[24,80,98],"candidate's":[25,81],"pre-cropped":[26],"face":[27],"track":[28],"separately":[29],"and":[30,77,93,100,107,127,172],"do":[31],"not":[32],"sufficiently":[33],"consider":[34],"relationships":[36,89],"among":[37,90],"candidates.":[39],"This":[40],"potentially":[41],"limits":[42],"performance,":[43],"especially":[44],"in":[45,121],"challenging":[46,138,165,200],"scenarios":[47],"with":[48,97,168,175],"low-resolution":[49],"faces,":[50],"multiple":[51,66],"candidates,":[52],"etc.":[53],"Our":[54],"solution":[55],"is":[56,134],"novel,":[58],"unified":[59,123],"framework":[60],"that":[61],"focuses":[62],"jointly":[64],"modeling":[65],"types":[67],"of":[68,79,155,205],"contextual":[69],"information:":[70],"spatial":[71],"context":[72,84,102],"to":[73,85,103],"indicate":[74],"position":[76],"scale":[78],"face,":[82],"relational":[83],"capture":[86],"visual":[88],"candidates":[92,120],"contrast":[94],"audio-visual":[95],"affinities":[96],"other,":[99],"temporal":[101],"aggregate":[104],"long-term":[105],"information":[106],"smooth":[108],"out":[109],"local":[110],"uncertainties.":[111],"Based":[112],"such":[114],"information,":[115],"our":[116,146,182],"model":[117],"optimizes":[118],"all":[119],"process":[124],"reliable":[128],"ASD.":[129],"A":[130],"thorough":[131],"ablation":[132],"study":[133],"performed":[135],"several":[137],"benchmarks":[140],"under":[141],"different":[142],"settings.":[143],"In":[144],"particular,":[145],"method":[147],"outperforms":[148],"state-of-the-art":[150],"by":[151],"large":[153],"margin":[154],"about":[156],"15%":[157],"mean":[158],"Average":[159],"Precision":[160],"(mAP)":[161],"absolute":[162],"two":[164],"subsets:":[166],"one":[167],"three":[169],"candidate":[170],"speakers,":[171],"other":[174],"faces":[176],"smaller":[177],"than":[178],"64":[179],"pixels.":[180],"Together,":[181],"UniCon":[183],"achieves":[184],"92.0%":[185],"mAP":[186],"AVA-ActiveSpeaker":[189],"validation":[190],"set,":[191],"surpassing":[192],"90%":[193],"first":[196],"time":[197,204],"this":[199],"dataset":[201],"at":[202],"submission.":[206],"Project":[207],"website:":[208],"https://unicon-asd.github.io/.":[209]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":15},{"year":2023,"cited_by_count":8},{"year":2022,"cited_by_count":5}],"updated_date":"2026-05-07T13:39:58.223016","created_date":"2021-08-16T00:00:00"}
