{"id":"https://openalex.org/W3213716738","doi":"https://doi.org/10.1109/mmsp53017.2021.9733678","title":"Visually Supervised Speaker Detection and Localization via Microphone Array","display_name":"Visually Supervised Speaker Detection and Localization via Microphone Array","publication_year":2021,"publication_date":"2021-10-06","ids":{"openalex":"https://openalex.org/W3213716738","doi":"https://doi.org/10.1109/mmsp53017.2021.9733678","mag":"3213716738"},"language":"en","primary_location":{"id":"doi:10.1109/mmsp53017.2021.9733678","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp53017.2021.9733678","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE 23rd International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005679652","display_name":"Davide Berghi","orcid":"https://orcid.org/0000-0001-6279-6364"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Davide Berghi","raw_affiliation_strings":["University of Surrey,CVSSP,UK","CVSSP, University of Surrey, UK"],"affiliations":[{"raw_affiliation_string":"University of Surrey,CVSSP,UK","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"CVSSP, University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016376222","display_name":"Adrian Hilton","orcid":"https://orcid.org/0000-0003-4223-238X"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Adrian Hilton","raw_affiliation_strings":["University of Surrey,CVSSP,UK","CVSSP, University of Surrey, UK"],"affiliations":[{"raw_affiliation_string":"University of Surrey,CVSSP,UK","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"CVSSP, University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5022165330","display_name":"Philip J. B. Jackson","orcid":"https://orcid.org/0000-0001-7933-5935"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Philip J.B. Jackson","raw_affiliation_strings":["University of Surrey,CVSSP,UK","CVSSP, University of Surrey, UK"],"affiliations":[{"raw_affiliation_string":"University of Surrey,CVSSP,UK","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"CVSSP, University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5005679652"],"corresponding_institution_ids":["https://openalex.org/I28290843"],"apc_list":null,"apc_paid":null,"fwci":1.2189,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.79776876,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":"25","issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7377212047576904},{"id":"https://openalex.org/keywords/microphone-array","display_name":"Microphone array","score":0.7153412699699402},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6549272537231445},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.5846118927001953},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5092154741287231},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38897156715393066},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.13899707794189453}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7377212047576904},{"id":"https://openalex.org/C2778806681","wikidata":"https://www.wikidata.org/wiki/Q907293","display_name":"Microphone array","level":4,"score":0.7153412699699402},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6549272537231445},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.5846118927001953},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5092154741287231},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38897156715393066},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.13899707794189453},{"id":"https://openalex.org/C68115822","wikidata":"https://www.wikidata.org/wiki/Q1068172","display_name":"Sound pressure","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/mmsp53017.2021.9733678","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp53017.2021.9733678","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE 23rd International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:alma.44SUR_INST:11153169330002346","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4210197018","display_name":"View","issn_l":"2688-268X","issn":["2688-268X","2688-3988"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320595","host_organization_name":"Wiley","host_organization_lineage":["https://openalex.org/P4310320595"],"host_organization_lineage_names":["Wiley"],"type":"journal"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8100000023841858,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1821462560","https://openalex.org/W2037227137","https://openalex.org/W2073902875","https://openalex.org/W2154636774","https://openalex.org/W2163605009","https://openalex.org/W2171727224","https://openalex.org/W2287407690","https://openalex.org/W2511428026","https://openalex.org/W2523915246","https://openalex.org/W2954458766","https://openalex.org/W2962865004","https://openalex.org/W2963218389","https://openalex.org/W2964171275","https://openalex.org/W2982624843","https://openalex.org/W2988200020","https://openalex.org/W2989980422","https://openalex.org/W3016098309","https://openalex.org/W3025443948","https://openalex.org/W3031666665","https://openalex.org/W3034702511","https://openalex.org/W3049847664","https://openalex.org/W3188558905","https://openalex.org/W4210997624","https://openalex.org/W4287766186","https://openalex.org/W4289665794","https://openalex.org/W4293569348","https://openalex.org/W4293665662","https://openalex.org/W4393535895","https://openalex.org/W6638523607","https://openalex.org/W6684191040","https://openalex.org/W6685120548","https://openalex.org/W6729831399","https://openalex.org/W6738607494","https://openalex.org/W6765195009","https://openalex.org/W6767453888","https://openalex.org/W6779923105","https://openalex.org/W6842329589"],"related_works":["https://openalex.org/W1879255185","https://openalex.org/W4297807400","https://openalex.org/W1491159402","https://openalex.org/W4313854686","https://openalex.org/W2249138175","https://openalex.org/W1977167953","https://openalex.org/W3162054169","https://openalex.org/W1813780412","https://openalex.org/W289407349","https://openalex.org/W107154053"],"abstract_inverted_index":{"Active":[0],"speaker":[1,78,130,175],"detection":[2],"(ASD)":[3],"is":[4,15,48,147,156],"a":[5,18,93,127,143,197],"multi-modal":[6],"task":[7],"that":[8,192],"aims":[9],"to":[10,80,109,122,149,161],"identify":[11],"who,":[12],"if":[13],"anyone,":[14],"speaking":[16],"from":[17,181],"set":[19,60],"of":[20,35,61,70,83,199],"candidates.":[21,62],"Current":[22],"audio-visual":[23],"approaches":[24],"for":[25],"ASD":[26],"typically":[27],"rely":[28],"on":[29,132,187],"visually":[30],"pre-extracted":[31,133],"face":[32,37,134],"tracks":[33],"(sequences":[34],"consecutive":[36],"crops)":[38],"and":[39],"the":[40,53,59,68,77,81,89,111,117,139,163,168,174,177,182,205],"respective":[41],"monaural":[42],"audio.":[43],"However,":[44],"their":[45],"recall":[46],"rate":[47],"often":[49],"low":[50],"as":[51,202,204],"only":[52],"visible":[54],"faces":[55],"are":[56],"included":[57],"in":[58,75,104,116,176,211],"Monaural":[63],"audio":[64,90,99,158,183],"may":[65],"successfully":[66],"detect":[67],"presence":[69],"speech":[71,214],"activity":[72,215],"but":[73],"fails":[74],"localizing":[76],"due":[79],"lack":[82],"spatial":[84],"cues.":[85],"Our":[86,136],"solution":[87],"extends":[88],"front-end":[91],"using":[92,126],"microphone":[94],"array.":[95],"We":[96,120],"train":[97],"an":[98,157,212],"convolutional":[100],"neural":[101],"network":[102,146,155,159,170,207],"(CNN)":[103],"combination":[105],"with":[106],"beamforming":[107],"techniques":[108],"regress":[110],"speaker\u2019s":[112],"horizontal":[113],"position":[114],"directly":[115,180],"video":[118],"frames.":[119],"propose":[121],"generate":[123,162],"weak":[124],"labels":[125],"pre-trained":[128],"active":[129],"detector":[131,216],"tracks.":[135],"pipeline":[137],"embraces":[138],"\"student-teacher\"":[140],"paradigm,":[141],"where":[142],"trained":[144,160],"\"teacher\"":[145],"used":[148],"produce":[150],"pseudo-labels":[151],"visually.":[152],"The":[153],"\"student\"":[154],"same":[164],"results.":[165],"At":[166],"inference,":[167],"student":[169],"can":[171],"independently":[172],"localize":[173],"visual":[178],"frames":[179],"input.":[184],"Experimental":[185],"results":[186,210],"newly":[188],"collected":[189],"data":[190],"prove":[191],"our":[193],"approach":[194],"significantly":[195],"outperforms":[196],"variety":[198],"other":[200],"baselines":[201],"well":[203],"teacher":[206],"itself.":[208],"It":[209],"excellent":[213],"too.":[217]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
