{"id":"https://openalex.org/W3150049814","doi":"https://doi.org/10.1109/icme51207.2021.9428196","title":"Weakly-Supervised Audio-Visual Sound Source Detection and Separation","display_name":"Weakly-Supervised Audio-Visual Sound Source Detection and Separation","publication_year":2021,"publication_date":"2021-06-09","ids":{"openalex":"https://openalex.org/W3150049814","doi":"https://doi.org/10.1109/icme51207.2021.9428196","mag":"3150049814"},"language":"en","primary_location":{"id":"doi:10.1109/icme51207.2021.9428196","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme51207.2021.9428196","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2104.02606","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112457904","display_name":"Tanzila Rahman","orcid":null},"institutions":[{"id":"https://openalex.org/I141945490","display_name":"University of British Columbia","ror":"https://ror.org/03rmrcq20","country_code":"CA","type":"education","lineage":["https://openalex.org/I141945490"]},{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Tanzila Rahman","raw_affiliation_strings":["University of British Columbia","[University of British Columbia, Vector Institute for AI]"],"affiliations":[{"raw_affiliation_string":"University of British Columbia","institution_ids":["https://openalex.org/I141945490"]},{"raw_affiliation_string":"[University of British Columbia, Vector Institute for AI]","institution_ids":["https://openalex.org/I4210127509","https://openalex.org/I141945490"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053011888","display_name":"Leonid Sigal","orcid":"https://orcid.org/0000-0002-3942-2804"},"institutions":[{"id":"https://openalex.org/I141945490","display_name":"University of British Columbia","ror":"https://ror.org/03rmrcq20","country_code":"CA","type":"education","lineage":["https://openalex.org/I141945490"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Leonid Sigal","raw_affiliation_strings":["University of British Columbia",", University of British Columbia"],"affiliations":[{"raw_affiliation_string":"University of British Columbia","institution_ids":["https://openalex.org/I141945490"]},{"raw_affiliation_string":", University of British Columbia","institution_ids":["https://openalex.org/I141945490"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5112457904"],"corresponding_institution_ids":["https://openalex.org/I141945490","https://openalex.org/I4210127509"],"apc_list":null,"apc_paid":null,"fwci":0.3078,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.51685016,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"3","issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11233","display_name":"Advanced Adaptive Filtering Techniques","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7786205410957336},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7777653932571411},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.695525050163269},{"id":"https://openalex.org/keywords/source-separation","display_name":"Source separation","score":0.6424638032913208},{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.6124500036239624},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5988802313804626},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5551272034645081},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5297088623046875},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5201502442359924},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5145720839500427},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.48410141468048096},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.4753284752368927},{"id":"https://openalex.org/keywords/computational-auditory-scene-analysis","display_name":"Computational auditory scene analysis","score":0.46151185035705566},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.4437955617904663},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.43139421939849854},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.41855764389038086},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3930791914463043},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3882770836353302},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.15507099032402039},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.14997735619544983},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.07088685035705566},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.0670098066329956}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7786205410957336},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7777653932571411},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.695525050163269},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.6424638032913208},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.6124500036239624},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5988802313804626},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5551272034645081},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5297088623046875},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5201502442359924},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5145720839500427},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.48410141468048096},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.4753284752368927},{"id":"https://openalex.org/C73208851","wikidata":"https://www.wikidata.org/wiki/Q5157303","display_name":"Computational auditory scene analysis","level":2,"score":0.46151185035705566},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.4437955617904663},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.43139421939849854},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.41855764389038086},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3930791914463043},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3882770836353302},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.15507099032402039},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.14997735619544983},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.07088685035705566},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0670098066329956},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/icme51207.2021.9428196","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme51207.2021.9428196","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2104.02606","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2104.02606","pdf_url":"https://arxiv.org/pdf/2104.02606","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:3150049814","is_oa":true,"landing_page_url":"http://export.arxiv.org/pdf/2104.02606","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2104.02606","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2104.02606","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2104.02606","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2104.02606","pdf_url":"https://arxiv.org/pdf/2104.02606","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities","score":0.7200000286102295}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3150049814.pdf","grobid_xml":"https://content.openalex.org/works/W3150049814.grobid-xml"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W1555814299","https://openalex.org/W1945608308","https://openalex.org/W1991139021","https://openalex.org/W2120847449","https://openalex.org/W2194775991","https://openalex.org/W2285559681","https://openalex.org/W2511428026","https://openalex.org/W2562637781","https://openalex.org/W2618253329","https://openalex.org/W2798122215","https://openalex.org/W2962756039","https://openalex.org/W2962865004","https://openalex.org/W2962960500","https://openalex.org/W2963218389","https://openalex.org/W2963430551","https://openalex.org/W2964001806","https://openalex.org/W2981816492","https://openalex.org/W2981851635","https://openalex.org/W2988200020","https://openalex.org/W2993182889","https://openalex.org/W6714030504","https://openalex.org/W6729831399","https://openalex.org/W6750469568","https://openalex.org/W6750591037","https://openalex.org/W6757240503","https://openalex.org/W7071105756"],"related_works":["https://openalex.org/W3172237229","https://openalex.org/W3202586499","https://openalex.org/W3154852953","https://openalex.org/W3153167542","https://openalex.org/W2963680395","https://openalex.org/W2899380617","https://openalex.org/W2124208211","https://openalex.org/W3042057498","https://openalex.org/W2977457315","https://openalex.org/W2949302831","https://openalex.org/W2780718603","https://openalex.org/W2754124089","https://openalex.org/W3092603041","https://openalex.org/W2809502869","https://openalex.org/W2595280704","https://openalex.org/W2235576802","https://openalex.org/W3049224640","https://openalex.org/W3114883909","https://openalex.org/W2626926650","https://openalex.org/W84456536"],"abstract_inverted_index":{"Learning":[0],"how":[1],"to":[2],"localize":[3],"and":[4,48,75,146],"separate":[5],"individual":[6,45],"object":[7,56,88,118],"sounds":[8],"in":[9,71,90],"the":[10,14,40,91,115,129],"audio":[11,24,62],"channel":[12],"of":[13,93,105,117],"video":[15],"is":[16],"a":[17,103,121],"difficult":[18],"task.":[19],"Current":[20],"state-of-the-art":[21,138],"methods":[22,139],"predict":[23],"masks":[25],"from":[26,51],"artificially":[27],"mixed":[28],"spectrograms,":[29],"known":[30],"as":[31],"Mix-and-Separate":[32],"framework.":[33],"We":[34,96],"propose":[35],"an":[36,72],"audio-visual":[37],"co-segmentation,":[38],"where":[39],"network":[41],"learns":[42],"both":[43],"what":[44],"objects":[46],"look":[47],"sound":[49,94,143,147],"like,":[50],"videos":[52],"labeled":[53],"with":[54],"only":[55],"labels.":[57],"Unlike":[58],"other":[59],"recent":[60],"visually-guided":[61],"source":[63,144],"separation":[64,145],"frameworks,":[65],"our":[66,134],"architecture":[67],"can":[68],"be":[69],"learned":[70,106],"end-to-end":[73],"manner":[74],"requires":[76],"no":[77],"additional":[78],"supervision":[79],"or":[80],"bounding":[81],"box":[82],"proposals.":[83],"Specifically,":[84],"we":[85],"introduce":[86],"weakly-supervised":[87],"segmentation":[89,119],"context":[92],"separation.":[95,125],"also":[97],"formulate":[98],"spectrogram":[99],"mask":[100,107],"prediction":[101],"using":[102,111],"set":[104],"bases,":[108],"which":[109],"combine":[110],"coefficients":[112],"conditioned":[113],"on":[114,128,140],"output":[116],"\u2014":[120],"design":[122],"that":[123,133],"facilitates":[124],"Extensive":[126],"experiments":[127],"MUSIC":[130],"dataset":[131],"show":[132],"proposed":[135],"approach":[136],"outperforms":[137],"visually":[141],"guided":[142],"denoising.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2021,"cited_by_count":2}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
