{"id":"https://openalex.org/W4405778816","doi":"https://doi.org/10.1109/tmm.2024.3521791","title":"Enhancing Real-World Active Speaker Detection With Multi-Modal Extraction Pre-Training","display_name":"Enhancing Real-World Active Speaker Detection With Multi-Modal Extraction Pre-Training","publication_year":2024,"publication_date":"2024-12-25","ids":{"openalex":"https://openalex.org/W4405778816","doi":"https://doi.org/10.1109/tmm.2024.3521791"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2024.3521791","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3521791","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026034735","display_name":"Ruijie Tao","orcid":"https://orcid.org/0000-0003-0021-5661"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Ruijie Tao","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056495776","display_name":"Xinyuan Qian","orcid":"https://orcid.org/0000-0002-9511-6713"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyuan Qian","raw_affiliation_strings":["Department of Computer Science and Technology, University of Science and Technology Beijing, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029637694","display_name":"Rohan Kumar Das","orcid":"https://orcid.org/0000-0002-1332-3357"},"institutions":[{"id":"https://openalex.org/I79891267","display_name":"Singapore Management University","ror":"https://ror.org/050qmg959","country_code":"SG","type":"education","lineage":["https://openalex.org/I79891267"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Rohan Kumar Das","raw_affiliation_strings":["Fortemedia, Singapore, Singapore","Fortemedia, Singapore"],"affiliations":[{"raw_affiliation_string":"Fortemedia, Singapore, Singapore","institution_ids":["https://openalex.org/I79891267"]},{"raw_affiliation_string":"Fortemedia, Singapore","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101856962","display_name":"Xiaoxue Gao","orcid":"https://orcid.org/0000-0003-1920-5228"},"institutions":[{"id":"https://openalex.org/I115228651","display_name":"Agency for Science, Technology and Research","ror":"https://ror.org/036wvzt09","country_code":"SG","type":"government","lineage":["https://openalex.org/I115228651"]},{"id":"https://openalex.org/I3005327000","display_name":"Institute for Infocomm Research","ror":"https://ror.org/053rfa017","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I3005327000","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xiaoxue Gao","raw_affiliation_strings":["Institute for Infocomm Research, Singapore, Singapore","Institute for Infocomm Research, A*STAR, Singapore"],"affiliations":[{"raw_affiliation_string":"Institute for Infocomm Research, Singapore, Singapore","institution_ids":["https://openalex.org/I3005327000"]},{"raw_affiliation_string":"Institute for Infocomm Research, A*STAR, Singapore","institution_ids":["https://openalex.org/I3005327000","https://openalex.org/I115228651"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100663491","display_name":"Jiadong Wang","orcid":"https://orcid.org/0000-0001-9372-3133"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Jiadong Wang","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032690182","display_name":"Haizhou Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haizhou Li","raw_affiliation_strings":["Shenzhen Research Institute of Big data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, Guangdong, China","institution_ids":["https://openalex.org/I4210116924"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5026034735"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":1.366,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.85173294,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"27","issue":null,"first_page":"2362","last_page":"2373"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.989300012588501,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.989300012588501,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9710000157356262,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8470172882080078},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5708129405975342},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5543535947799683},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5368040204048157},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4614732265472412},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.45266658067703247},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44026902318000793}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8470172882080078},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5708129405975342},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5543535947799683},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5368040204048157},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4614732265472412},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.45266658067703247},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44026902318000793},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2024.3521791","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3521791","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2799040894","display_name":null,"funder_award_id":"62271432","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2044271574","https://openalex.org/W2082183045","https://openalex.org/W2121486117","https://openalex.org/W2194775991","https://openalex.org/W2323437298","https://openalex.org/W2330149154","https://openalex.org/W2604379605","https://openalex.org/W2696967604","https://openalex.org/W2796417745","https://openalex.org/W2808631503","https://openalex.org/W2890952074","https://openalex.org/W2896457183","https://openalex.org/W2964058413","https://openalex.org/W2964171275","https://openalex.org/W2971784756","https://openalex.org/W2972513594","https://openalex.org/W2973062255","https://openalex.org/W3008003372","https://openalex.org/W3008400075","https://openalex.org/W3010594275","https://openalex.org/W3015199127","https://openalex.org/W3016098309","https://openalex.org/W3034702511","https://openalex.org/W3041847644","https://openalex.org/W3094502228","https://openalex.org/W3116298410","https://openalex.org/W3119269912","https://openalex.org/W3163287738","https://openalex.org/W3169351047","https://openalex.org/W3176404283","https://openalex.org/W3189964604","https://openalex.org/W3197584784","https://openalex.org/W3203209821","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4214701094","https://openalex.org/W4225302874","https://openalex.org/W4286378963","https://openalex.org/W4289665794","https://openalex.org/W4292793901","https://openalex.org/W4295308317","https://openalex.org/W4312268709","https://openalex.org/W4312319640","https://openalex.org/W4312367758","https://openalex.org/W4312466984","https://openalex.org/W4313156423","https://openalex.org/W4365807578","https://openalex.org/W4366548962","https://openalex.org/W4385245566","https://openalex.org/W4385822320","https://openalex.org/W4386075783","https://openalex.org/W4392908937","https://openalex.org/W4393147127","https://openalex.org/W4402754289"],"related_works":["https://openalex.org/W1491159402","https://openalex.org/W4297807400","https://openalex.org/W2249138175","https://openalex.org/W4313854686","https://openalex.org/W3162054169","https://openalex.org/W1813780412","https://openalex.org/W289407349","https://openalex.org/W2368768466","https://openalex.org/W2757081366","https://openalex.org/W3197877226"],"abstract_inverted_index":{"Audio-visual":[0],"active":[1],"speaker":[2],"detection":[3],"(AV-ASD)":[4],"aims":[5],"to":[6,44,94,108],"identify":[7],"which":[8,86],"visible":[9],"face":[10],"is":[11,31,87,101,124],"speaking":[12],"in":[13,35,51,178],"a":[14,32,57,79],"scene":[15],"with":[16,89,103,116],"one":[17],"or":[18],"more":[19],"persons.":[20],"Most":[21],"existing":[22],"AV-ASD":[23,41,54,105,147,169],"methods":[24,148],"prioritize":[25],"capturing":[26],"speech-lip":[27],"correspondence.":[28],"However,":[29],"there":[30],"noticeable":[33],"gap":[34],"addressing":[36],"the":[37,45,96,104,111,127,133,145,154,160,167,176],"challenges":[38],"from":[39,70],"real-world":[40,117],"scenarios.":[42],"Due":[43],"presence":[46],"of":[47,63],"low-quality":[48],"noisy":[49],"videos":[50],"such":[52,119],"cases,":[53],"systems":[55],"without":[56],"selective":[58],"listening":[59],"ability":[60],"are":[61],"short":[62],"effectively":[64],"filtering":[65],"out":[66],"disruptive":[67],"voice":[68],"components":[69],"mixed":[71],"audio":[72],"inputs.":[73],"In":[74],"this":[75],"paper,":[76],"we":[77],"propose":[78],"Multi-modal":[80],"Speech":[81],"Extraction-to-Detection":[82],"framework":[83],"named":[84],"\u2018MuSED\u2019,":[85],"pre-trained":[88],"audio-visual":[90],"target":[91],"speech":[92],"extraction":[93],"learn":[95],"denoising":[97],"ability,":[98],"then":[99],"it":[100],"fine-tuned":[102],"task.":[106],"Meanwhile,":[107],"better":[109],"capture":[110],"multi-modal":[112,134],"information":[113],"and":[114,131,149,163],"deal":[115],"problems":[118],"as":[120],"missing":[121],"modality,":[122],"MuSED":[123,142],"modelled":[125],"on":[126,153,159,166],"time":[128],"domain":[129],"directly":[130],"integrates":[132],"plus-and-minus":[135],"augmentation":[136],"strategy.":[137],"Our":[138],"experiments":[139],"demonstrate":[140],"that":[141],"substantially":[143],"outperforms":[144],"state-of-the-art":[146],"achieves":[150],"95.6%":[151],"mAP":[152],"AVA-ActiveSpeaker":[155],"dataset,":[156,162,170],"98.3%":[157],"AP":[158],"ASW":[161],"97.9%":[164],"F1":[165],"Columbia":[168],"respectively.":[171],"We":[172],"will":[173],"publicly":[174],"release":[175],"code":[177],"due":[179],"course.":[180]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
