{"id":"https://openalex.org/W4312674133","doi":"https://doi.org/10.1109/mmsp55362.2022.9949329","title":"As We Speak: Real-Time Visually Guided Speaker Separation and Localization","display_name":"As We Speak: Real-Time Visually Guided Speaker Separation and Localization","publication_year":2022,"publication_date":"2022-09-26","ids":{"openalex":"https://openalex.org/W4312674133","doi":"https://doi.org/10.1109/mmsp55362.2022.9949329"},"language":"en","primary_location":{"id":"doi:10.1109/mmsp55362.2022.9949329","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp55362.2022.9949329","pdf_url":null,"source":{"id":"https://openalex.org/S4363605768","display_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075564775","display_name":"P. Czarnecki","orcid":"https://orcid.org/0000-0002-1411-3351"},"institutions":[{"id":"https://openalex.org/I4210128738","display_name":"Samsung (Poland)","ror":"https://ror.org/0381acm07","country_code":"PL","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210128738"]},{"id":"https://openalex.org/I108403487","display_name":"Warsaw University of Technology","ror":"https://ror.org/00y0xnp53","country_code":"PL","type":"education","lineage":["https://openalex.org/I108403487"]}],"countries":["PL"],"is_corresponding":true,"raw_author_name":"Piotr Czarnecki","raw_affiliation_strings":["Samsung R&#x0026;D Institute,Poland","Warsaw University of Technology, Warsaw, Poland"],"affiliations":[{"raw_affiliation_string":"Samsung R&#x0026;D Institute,Poland","institution_ids":["https://openalex.org/I4210128738"]},{"raw_affiliation_string":"Warsaw University of Technology, Warsaw, Poland","institution_ids":["https://openalex.org/I108403487"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110944393","display_name":"Jakub Tkaczuk","orcid":null},"institutions":[{"id":"https://openalex.org/I108403487","display_name":"Warsaw University of Technology","ror":"https://ror.org/00y0xnp53","country_code":"PL","type":"education","lineage":["https://openalex.org/I108403487"]},{"id":"https://openalex.org/I4210128738","display_name":"Samsung (Poland)","ror":"https://ror.org/0381acm07","country_code":"PL","type":"company","lineage":["https://openalex.org/I2250650973","https://openalex.org/I4210128738"]}],"countries":["PL"],"is_corresponding":false,"raw_author_name":"Jakub Tkaczuk","raw_affiliation_strings":["Samsung R&#x0026;D Institute,Poland","Warsaw University of Technology, Warsaw, Poland"],"affiliations":[{"raw_affiliation_string":"Samsung R&#x0026;D Institute,Poland","institution_ids":["https://openalex.org/I4210128738"]},{"raw_affiliation_string":"Warsaw University of Technology, Warsaw, Poland","institution_ids":["https://openalex.org/I108403487"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5075564775"],"corresponding_institution_ids":["https://openalex.org/I108403487","https://openalex.org/I4210128738"],"apc_list":null,"apc_paid":null,"fwci":0.1227,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.29952607,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"4","issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8265163898468018},{"id":"https://openalex.org/keywords/panning","display_name":"Panning (audio)","score":0.7874895930290222},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.630557656288147},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.6135400533676147},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.5791091918945312},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5554243922233582},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5378185510635376},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.513009250164032},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5065029859542847},{"id":"https://openalex.org/keywords/detector","display_name":"Detector","score":0.4996650218963623},{"id":"https://openalex.org/keywords/source-separation","display_name":"Source separation","score":0.4887809455394745},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4519025683403015},{"id":"https://openalex.org/keywords/face-detection","display_name":"Face detection","score":0.44911065697669983},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4213869571685791},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.4106214940547943},{"id":"https://openalex.org/keywords/facial-recognition-system","display_name":"Facial recognition system","score":0.28173011541366577},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.25466081500053406},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.0815659761428833}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8265163898468018},{"id":"https://openalex.org/C108944566","wikidata":"https://www.wikidata.org/wiki/Q1524510","display_name":"Panning (audio)","level":4,"score":0.7874895930290222},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.630557656288147},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.6135400533676147},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.5791091918945312},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5554243922233582},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5378185510635376},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.513009250164032},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5065029859542847},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.4996650218963623},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.4887809455394745},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4519025683403015},{"id":"https://openalex.org/C4641261","wikidata":"https://www.wikidata.org/wiki/Q11681085","display_name":"Face detection","level":4,"score":0.44911065697669983},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4213869571685791},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.4106214940547943},{"id":"https://openalex.org/C31510193","wikidata":"https://www.wikidata.org/wiki/Q1192553","display_name":"Facial recognition system","level":3,"score":0.28173011541366577},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25466081500053406},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0815659761428833},{"id":"https://openalex.org/C124913957","wikidata":"https://www.wikidata.org/wiki/Q1232548","display_name":"Zoom","level":3,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C15336307","wikidata":"https://www.wikidata.org/wiki/Q1766051","display_name":"Lens (geology)","level":2,"score":0.0},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.0},{"id":"https://openalex.org/C78762247","wikidata":"https://www.wikidata.org/wiki/Q1273174","display_name":"Petroleum engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mmsp55362.2022.9949329","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp55362.2022.9949329","pdf_url":null,"source":{"id":"https://openalex.org/S4363605768","display_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W2069681747","https://openalex.org/W2096391593","https://openalex.org/W2144763279","https://openalex.org/W2159126120","https://openalex.org/W2164899449","https://openalex.org/W2169896329","https://openalex.org/W2291877678","https://openalex.org/W2519091744","https://openalex.org/W2734774145","https://openalex.org/W2788241093","https://openalex.org/W2792764867","https://openalex.org/W2889442120","https://openalex.org/W2901702433","https://openalex.org/W2937484199","https://openalex.org/W2940275453","https://openalex.org/W2949756029","https://openalex.org/W2962865004","https://openalex.org/W2962935966","https://openalex.org/W2962960500","https://openalex.org/W2963103134","https://openalex.org/W2963163009","https://openalex.org/W2963321191","https://openalex.org/W2963453742","https://openalex.org/W2964171275","https://openalex.org/W2972481755","https://openalex.org/W2972513594","https://openalex.org/W2981851635","https://openalex.org/W2988200020","https://openalex.org/W3016098309","https://openalex.org/W3034552680","https://openalex.org/W3034702511","https://openalex.org/W3116298410","https://openalex.org/W3154852953","https://openalex.org/W3197011355","https://openalex.org/W3197042120","https://openalex.org/W4289665794","https://openalex.org/W6683281771","https://openalex.org/W6684193366","https://openalex.org/W6684878041","https://openalex.org/W6749825310","https://openalex.org/W6757632829","https://openalex.org/W6757716108"],"related_works":["https://openalex.org/W2591723141","https://openalex.org/W2053430337","https://openalex.org/W2785782068","https://openalex.org/W2032030284","https://openalex.org/W4300189336","https://openalex.org/W3046792021","https://openalex.org/W3169342585","https://openalex.org/W2159773886","https://openalex.org/W2744841575","https://openalex.org/W4245925285"],"abstract_inverted_index":{"Real-time":[0],"speaker":[1,28,71,84,121,133,158,182],"separation":[2,31,43,85,134,179],"and":[3,30,37,180],"localization":[4,29],"is":[5,32,125,150,167],"crucial":[6],"to":[7,26,33,87,116],"enable":[8],"applications":[9],"for":[10,44,51,69,80,107,155],"video":[11,59],"call":[12],"enhancement,":[13],"automatic":[14],"subtitles":[15],"localization,":[16],"as":[17,19],"well":[18],"spatial":[20],"voice":[21,42],"generation/panning.":[22],"The":[23,91,101],"common":[24],"approach":[25,96],"perform":[27,39,118],"detect":[34],"candidate":[35],"faces":[36],"then":[38],"visual":[40,66,82,119,156],"guided":[41,83,120,157],"each.":[45],"There":[46],"are":[47],"two":[48],"methods":[49],"used":[50],"face":[52,55,99,114],"detection:":[53],"with":[54,64,97,127,137],"detector":[56],"on":[57],"static":[58],"frames":[60],"[1],":[61],"[2]":[62],"or":[63],"audio":[65,144],"sequence":[67],"processing":[68,115],"active":[70,181],"detection":[72],"[3].":[73],"In":[74],"this":[75],"work,":[76],"we":[77],"propose":[78],"improvements":[79],"the":[81,95,138,151,161,170,176],"model":[86,93,102,171],"make":[88],"it":[89,149,166],"real-time.":[90],"described":[92],"follows":[94],"a":[98,141],"detector.":[100],"extends":[103],"real-time":[104,153],"models":[105],"known":[106],"speech":[108,178],"enhancement":[109],"[4],":[110],"[5]":[111],"by":[112],"adding":[113],"ultimately":[117],"separation.":[122,159],"Our":[123],"system":[124,154],"lightweight":[126],"0.6M":[128],"trainable":[129],"parameters.":[130],"It":[131],"performs":[132,172],"near":[135],"instantaneously":[136],"delay":[139],"of":[140,164],"single":[142],"input":[143],"frame.":[145],"To":[146],"our":[147],"knowledge,":[148],"first":[152],"From":[160],"application":[162],"point":[163],"view":[165],"important":[168],"that":[169],"both":[173],"tasks":[174],"at":[175],"time:":[177],"localization.":[183]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
