{"id":"https://openalex.org/W7138002674","doi":"https://doi.org/10.48550/arxiv.2603.14203","title":"Selective Noise Suppression and Discriminative Mutual Interaction for Robust Audio-Visual Segmentation","display_name":"Selective Noise Suppression and Discriminative Mutual Interaction for Robust Audio-Visual Segmentation","publication_year":2026,"publication_date":"2026-03-15","ids":{"openalex":"https://openalex.org/W7138002674","doi":"https://doi.org/10.48550/arxiv.2603.14203"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14203","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14203","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14203","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129704848","display_name":"Kai Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Peng, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129675923","display_name":"Yunzhe Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yunzhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129727250","display_name":"Miao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Miao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113419194","display_name":"Leiye Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Leiye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129707249","display_name":"Yidong Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Yidong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129708591","display_name":"Wei Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129732146","display_name":"Jingjing Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jingjing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014975416","display_name":"Yongri Piao","orcid":"https://orcid.org/0000-0002-0860-252X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Piao, Yongri","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129684419","display_name":"Huchuan Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Huchuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5129704848"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.4984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.4984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.36489999294281006,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.04399999976158142,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.8891000151634216},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6498000025749207},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.6019999980926514},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.48660001158714294},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.428600013256073},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.4178999960422516},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.39660000801086426},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.3849000036716461}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.8891000151634216},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7567999958992004},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6498000025749207},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6384000182151794},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.6019999980926514},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5511999726295471},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.48660001158714294},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.428600013256073},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.4178999960422516},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.41350001096725464},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.39660000801086426},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.3849000036716461},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.3792000114917755},{"id":"https://openalex.org/C32022120","wikidata":"https://www.wikidata.org/wiki/Q797225","display_name":"Interference (communication)","level":3,"score":0.35679998993873596},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3402999937534332},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3361000120639801},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.32010000944137573},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.27489998936653137},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.26980000734329224},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2635999917984009}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14203","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14203","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14203","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14203","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7820913195610046,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,99],"ability":[1],"to":[2,47],"capture":[3],"and":[4,35,74,91,136,141],"segment":[5],"sounding":[6],"objects":[7],"in":[8,28,134],"dynamic":[9],"visual":[10,36,75],"scenes":[11],"is":[12],"crucial":[13],"for":[14],"the":[15,31,49,72,85,92],"development":[16],"of":[17],"Audio-Visual":[18,94],"Segmentation":[19],"(AVS)":[20],"tasks.":[21],"While":[22],"significant":[23],"progress":[24],"has":[25],"been":[26],"made":[27],"this":[29,43,78],"area,":[30],"interaction":[32,70],"between":[33,71],"audio":[34,58,63,73,103],"modalities":[37],"still":[38],"requires":[39],"further":[40],"exploration.":[41],"In":[42],"work,":[44],"we":[45,67,80],"aim":[46],"answer":[48],"following":[50],"questions:":[51],"How":[52,65],"can":[53,66],"a":[54],"model":[55,142],"effectively":[56],"suppress":[57],"noise":[59,104],"while":[60,112],"enhancing":[61],"relevant":[62,109],"information?":[64],"achieve":[68],"discriminative":[69],"modalities?":[76],"To":[77],"end,":[79],"propose":[81],"SDAVS,":[82],"equipped":[83],"with":[84],"Selective":[86],"Noise-Resilient":[87],"Processor":[88],"(SNRP)":[89],"module":[90],"Discriminative":[93],"Mutual":[95],"Fusion":[96],"(DAMF)":[97],"strategy.":[98],"proposed":[100,124],"SNRP":[101],"mitigates":[102],"interference":[105],"by":[106],"selectively":[107],"emphasizing":[108],"auditory":[110],"cues,":[111],"DAMF":[113],"ensures":[114],"more":[115],"consistent":[116],"audio-visual":[117],"representations.":[118],"Experimental":[119],"results":[120],"demonstrate":[121],"that":[122],"our":[123],"method":[125],"achieves":[126],"state-of-the-art":[127],"performance":[128],"on":[129],"benchmark":[130],"AVS":[131],"datasets,":[132],"especially":[133],"multi-source":[135],"complex":[137],"scenes.":[138],"\\textit{The":[139],"code":[140],"are":[143],"available":[144],"at":[145],"https://github.com/happylife-pk/SDAVS}.":[146]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
