{"id":"https://openalex.org/W2005402103","doi":"https://doi.org/10.1109/icassp.2014.6853854","title":"Look who's talking: Detecting the dominant speaker in a cluttered scenario","display_name":"Look who's talking: Detecting the dominant speaker in a cluttered scenario","publication_year":2014,"publication_date":"2014-05-01","ids":{"openalex":"https://openalex.org/W2005402103","doi":"https://doi.org/10.1109/icassp.2014.6853854","mag":"2005402103"},"language":"en","primary_location":{"id":"doi:10.1109/icassp.2014.6853854","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2014.6853854","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083533632","display_name":"Eleonora D'Arca","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Eleonora D'Arca","raw_affiliation_strings":["Joint Research Institute for Signal and Image Processing, University of Edinburgh, UK","Joint Res. Inst. for Signal & Image Process., Heriot-Watt Univ. & Univ. of Edinburgh, Edinburgh, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Joint Research Institute for Signal and Image Processing, University of Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]},{"raw_affiliation_string":"Joint Res. Inst. for Signal & Image Process., Heriot-Watt Univ. & Univ. of Edinburgh, Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102909686","display_name":"Neil M. Robertson","orcid":"https://orcid.org/0000-0003-2461-8799"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Neil M. Robertson","raw_affiliation_strings":["Joint Research Institute for Signal and Image Processing, University of Edinburgh, UK","Joint Res. Inst. for Signal & Image Process., Heriot-Watt Univ. & Univ. of Edinburgh, Edinburgh, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Joint Research Institute for Signal and Image Processing, University of Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]},{"raw_affiliation_string":"Joint Res. Inst. for Signal & Image Process., Heriot-Watt Univ. & Univ. of Edinburgh, Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016434897","display_name":"James R. Hopgood","orcid":"https://orcid.org/0000-0002-3029-2425"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"James R. Hopgood","raw_affiliation_strings":["Joint Research Institute for Signal and Image Processing, University of Edinburgh, UK","Joint Res. Inst. for Signal & Image Process., Heriot-Watt Univ. & Univ. of Edinburgh, Edinburgh, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Joint Research Institute for Signal and Image Processing, University of Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]},{"raw_affiliation_string":"Joint Res. Inst. for Signal & Image Process., Heriot-Watt Univ. & Univ. of Edinburgh, Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.4743,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.81956733,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"17","issue":null,"first_page":"1532","last_page":"1536"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11665","display_name":"Animal Vocal Communication and Behavior","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1309","display_name":"Developmental Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8165581226348877},{"id":"https://openalex.org/keywords/mel-frequency-cepstrum","display_name":"Mel-frequency cepstrum","score":0.6493794918060303},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5835354924201965},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5808886885643005},{"id":"https://openalex.org/keywords/cepstrum","display_name":"Cepstrum","score":0.5132915377616882},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.49406173825263977},{"id":"https://openalex.org/keywords/optical-flow","display_name":"Optical flow","score":0.47757962346076965},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.4704880714416504},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.46655696630477905},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.4627615809440613},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.44281625747680664},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.35376477241516113},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3293578028678894},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.2747035622596741},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.16103774309158325},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.11112415790557861},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09522581100463867}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8165581226348877},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.6493794918060303},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5835354924201965},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5808886885643005},{"id":"https://openalex.org/C88485024","wikidata":"https://www.wikidata.org/wiki/Q1054571","display_name":"Cepstrum","level":2,"score":0.5132915377616882},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49406173825263977},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.47757962346076965},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.4704880714416504},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.46655696630477905},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.4627615809440613},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.44281625747680664},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.35376477241516113},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3293578028678894},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2747035622596741},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.16103774309158325},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.11112415790557861},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09522581100463867},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp.2014.6853854","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2014.6853854","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.qub.ac.uk/portal:publications/5c7a539c-3dfb-482b-bd08-b607bf890c49","is_oa":false,"landing_page_url":"https://pure.qub.ac.uk/en/publications/5c7a539c-3dfb-482b-bd08-b607bf890c49","pdf_url":null,"source":{"id":"https://openalex.org/S4306402319","display_name":"Research Portal (Queen's University Belfast)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I126231945","host_organization_name":"Queen's University Belfast","host_organization_lineage":["https://openalex.org/I126231945"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"D'Arca , E , Robertson , N &amp; Hopgood , J R 2014 , ' Look who's talking: Detecting the dominant speaker in a cluttered scenario ' , pp. 1532 -- 1536 . https://doi.org/10.1109/ICASSP.2014.6853854","raw_type":"conferenceObject"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.7599999904632568,"display_name":"Peace, Justice and strong institutions"}],"awards":[{"id":"https://openalex.org/G8726282573","display_name":null,"funder_award_id":"EP/K014277/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W1579093989","https://openalex.org/W1587928245","https://openalex.org/W1975745930","https://openalex.org/W1982547756","https://openalex.org/W2012637506","https://openalex.org/W2065274193","https://openalex.org/W2104380558","https://openalex.org/W2127025755","https://openalex.org/W2129421456","https://openalex.org/W2129866629","https://openalex.org/W2138414774","https://openalex.org/W2140647972","https://openalex.org/W2141463036","https://openalex.org/W2142612029","https://openalex.org/W2145097187","https://openalex.org/W2148882869","https://openalex.org/W2163141813","https://openalex.org/W2165734786","https://openalex.org/W2171819471","https://openalex.org/W2501398076","https://openalex.org/W4237723258","https://openalex.org/W6681054008"],"related_works":["https://openalex.org/W2098934641","https://openalex.org/W2494533082","https://openalex.org/W4214771044","https://openalex.org/W4382560817","https://openalex.org/W4387698063","https://openalex.org/W1975359510","https://openalex.org/W3004352674","https://openalex.org/W3110605476","https://openalex.org/W1803351015","https://openalex.org/W2363106653"],"abstract_inverted_index":{"In":[0,87,129],"this":[1,130],"work":[2,184],"we":[3,67,132],"propose":[4],"a":[5,176,186,195,201],"novel":[6],"method":[7,73],"to":[8,74,96,124,142,154],"automatically":[9],"detect":[10],"and":[11,24,57,79,102,139,151,190,207],"localise":[12],"the":[13,44,69,72,97,108,143,149,156,159,164,171],"dominant":[14,98,203],"speaker":[15,204],"in":[16,175,199],"an":[17,40],"enclosed":[18],"scenario":[19],"by":[20],"means":[21,33,38],"of":[22,46,71,77,158,167,173],"audio":[23,41,63,99,150],"video":[25,152],"cues.":[26],"The":[27],"underpinning":[28],"idea":[29],"is":[30],"that":[31],"gesturing":[32],"speaking,":[34],"so":[35],"observing":[36,39],"motions":[37],"signal.":[42],"To":[43],"best":[45],"our":[47,183],"knowledge":[48],"state-of-the-art":[49,187],"algorithms":[50],"are":[51,93],"focussed":[52],"on":[53,192],"stationary":[54],"motion":[55,104],"scenarios":[56,81],"close-up":[58],"scenes":[59],"where":[60],"only":[61],"one":[62],"source":[64],"exists,":[65],"whereas":[66],"enlarge":[68],"extent":[70],"larger":[75],"field":[76],"views":[78],"cluttered":[80],"including":[82],"multiple":[83],"non-stationary":[84],"moving":[85,90,202],"speakers.":[86],"such":[88],"contexts,":[89],"objects":[91],"which":[92],"not":[94],"correlated":[95],"may":[100,105,118],"exist":[101],"their":[103],"incorrectly":[106],"drive":[107],"audio-video":[109],"(AV)":[110],"correlation":[111],"estimation.":[112],"This":[113],"suggests":[114],"extra":[115],"localisation":[116],"data":[117],"be":[119],"fused":[120],"at":[121],"decision":[122],"level":[123],"avoid":[125],"detecting":[126],"false":[127],"positives.":[128],"work,":[131],"learn":[133],"Mel-frequency":[134],"cepstral":[135],"coefficients":[136,138],"(MFCC)":[137],"correlate":[140],"them":[141],"optical":[144],"flow.":[145],"We":[146,181],"also":[147],"exploit":[148],"signals":[153],"estimate":[155],"position":[157],"actual":[160],"speaker,":[161],"narrowing":[162],"down":[163],"visual":[165],"space":[166],"search,":[168],"hence":[169],"reducing":[170],"probability":[172],"incurring":[174],"wrong":[177],"voice-to-pixel":[178],"region":[179],"association.":[180],"compare":[182],"with":[185],"existing":[188],"algorithm":[189],"show":[191],"real":[193],"datasets":[194],"36%":[196],"precision":[197],"improvement":[198],"localising":[200],"through":[205],"occlusions":[206],"speech":[208],"interferences.":[209]},"counts_by_year":[{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":2},{"year":2015,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
