{"id":"https://openalex.org/W4401201543","doi":"https://doi.org/10.1145/3664647.3681261","title":"RAVSS: Robust Audio-Visual Speech Separation in Multi-Speaker Scenarios with Missing Visual Cues","display_name":"RAVSS: Robust Audio-Visual Speech Separation in Multi-Speaker Scenarios with Missing Visual Cues","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4401201543","doi":"https://doi.org/10.1145/3664647.3681261"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681261","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681261","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.19224","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114012695","display_name":"Tianrui Pan","orcid":"https://orcid.org/0009-0002-8195-2005"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tianrui Pan","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052226742","display_name":"Jie Liu","orcid":"https://orcid.org/0000-0002-9297-7729"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Liu","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","for ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]},{"raw_affiliation_string":"for ,","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113404637","display_name":"Bohan Wang","orcid":"https://orcid.org/0000-0002-5862-3548"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bohan Wang","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","for ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]},{"raw_affiliation_string":"for ,","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102012761","display_name":"Jie Tang","orcid":"https://orcid.org/0000-0002-6086-3559"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Tang","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","for ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]},{"raw_affiliation_string":"for ,","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101546753","display_name":"Gangshan Wu","orcid":"https://orcid.org/0000-0003-1391-1762"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gangshan Wu","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","for ,"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]},{"raw_affiliation_string":"for ,","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5114012695"],"corresponding_institution_ids":["https://openalex.org/I881766915"],"apc_list":null,"apc_paid":null,"fwci":0.3577,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.53705664,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"4748","last_page":"4756"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11233","display_name":"Advanced Adaptive Filtering Techniques","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7734541893005371},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.6600228548049927},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6398129463195801},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5206357836723328},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.5106320977210999},{"id":"https://openalex.org/keywords/sensory-cue","display_name":"Sensory cue","score":0.5029975771903992},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4844208061695099},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4481830894947052},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.1889660656452179},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.11551240086555481}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7734541893005371},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.6600228548049927},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6398129463195801},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5206357836723328},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.5106320977210999},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.5029975771903992},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4844208061695099},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4481830894947052},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.1889660656452179},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.11551240086555481},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3664647.3681261","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681261","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2407.19224","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.19224","pdf_url":"https://arxiv.org/pdf/2407.19224","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.19224","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.19224","pdf_url":"https://arxiv.org/pdf/2407.19224","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1991139021","https://openalex.org/W2594607416","https://openalex.org/W2734774145","https://openalex.org/W2808631503","https://openalex.org/W2952218014","https://openalex.org/W2981816492","https://openalex.org/W3015199127","https://openalex.org/W3096028031","https://openalex.org/W3099330747","https://openalex.org/W3160244659","https://openalex.org/W3163652268","https://openalex.org/W3182657421","https://openalex.org/W3197823486","https://openalex.org/W4206821167","https://openalex.org/W4224936432","https://openalex.org/W4243020849","https://openalex.org/W4289665794","https://openalex.org/W4295308317","https://openalex.org/W4312710798","https://openalex.org/W4312711418","https://openalex.org/W4385823228","https://openalex.org/W4386071467","https://openalex.org/W4390040710","https://openalex.org/W4392901809","https://openalex.org/W4393147127"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W4245955731","https://openalex.org/W2393726419"],"abstract_inverted_index":{"While":[0],"existing":[1],"Audio-Visual":[2],"Speech":[3],"Separation":[4],"(AVSS)":[5],"methods":[6,30],"primarily":[7],"concentrate":[8],"on":[9,94],"the":[10,24,40,55,71,95,170],"audio-visual":[11,126],"fusion":[12],"strategy":[13],"for":[14,148],"two-speaker":[15],"separation,":[16],"they":[17],"demonstrate":[18,100,162],"a":[19,63,78],"severe":[20],"performance":[21,106,172],"drop":[22,173],"in":[23,45,107],"multi-speaker":[25,65],"separation":[26,66,73],"scenarios.":[27],"Typically,":[28],"AVSS":[29],"employ":[31],"guiding":[32],"videos":[33],"to":[34,85,128,137],"sequentially":[35],"isolate":[36],"individual":[37],"speakers":[38,76,123,150],"from":[39],"given":[41],"audio":[42],"mixture,":[43],"resulting":[44],"notable":[46],"missing":[47,138],"and":[48,88,97,114,181],"noisy":[49],"parts":[50],"across":[51,174],"various":[52],"segments":[53],"of":[54,74],"separated":[56],"speech.":[57],"In":[58],"this":[59],"study,":[60],"we":[61],"propose":[62],"simultaneous":[64],"framework":[67],"that":[68,101,163],"can":[69,121],"facilitate":[70],"concurrent":[72],"multiple":[75],"within":[77],"singular":[79],"process.":[80],"We":[81,141],"introduce":[82],"speaker-wise":[83],"interactions":[84],"establish":[86],"distinctions":[87],"correlations":[89],"among":[90],"speakers.":[91,183],"Experimental":[92],"results":[93,161],"VoxCeleb2":[96],"LRS3":[98],"datasets":[99],"our":[102,119,164],"method":[103],"achieves":[104],"state-of-the-art":[105],"separating":[108],"mixtures":[109],"with":[110,124],"2,":[111,178],"3,":[112,179],"4,":[113,180],"5":[115,182],"speakers,":[116,132],"respectively.":[117],"Additionally,":[118],"model":[120,165],"utilize":[122],"complete":[125],"information":[127,147],"mitigate":[129],"other":[130],"visual-deficient":[131],"thereby":[133],"enhancing":[134],"its":[135],"resilience":[136],"visual":[139,146,155],"cues.":[140],"also":[142],"conduct":[143],"experiments":[144],"where":[145],"specific":[149],"is":[151],"entirely":[152],"absent":[153],"or":[154],"frames":[156],"are":[157],"partially":[158],"missing.":[159],"The":[160],"consistently":[166],"outperforms":[167],"others,":[168],"exhibiting":[169],"smallest":[171],"all":[175],"settings":[176],"involving":[177]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2024-08-01T00:00:00"}
