{"id":"https://openalex.org/W4415708520","doi":"https://doi.org/10.1109/icme59968.2025.11209970","title":"Boosting Audio-Visual Segmentation via Triple-Modalities Alignment","display_name":"Boosting Audio-Visual Segmentation via Triple-Modalities Alignment","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708520","doi":"https://doi.org/10.1109/icme59968.2025.11209970"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11209970","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209970","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109692762","display_name":"Yujian Lee","orcid":"https://orcid.org/0009-0003-2514-3913"},"institutions":[{"id":"https://openalex.org/I12615008","display_name":"Beijing Normal University - Hong Kong Baptist University United International College","ror":"https://ror.org/04snvc712","country_code":"CN","type":"education","lineage":["https://openalex.org/I12615008"]},{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]}],"countries":["CN","HK"],"is_corresponding":true,"raw_author_name":"Yujian Lee","raw_affiliation_strings":["Beijing Normal-Hong Kong Baptist University,Guangdong Provincial/Zhuhai Key Laboratory of IRADS,Zhuhai,China"],"affiliations":[{"raw_affiliation_string":"Beijing Normal-Hong Kong Baptist University,Guangdong Provincial/Zhuhai Key Laboratory of IRADS,Zhuhai,China","institution_ids":["https://openalex.org/I12615008","https://openalex.org/I141568987"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106667221","display_name":"Peng Gao","orcid":"https://orcid.org/0009-0005-7881-712X"},"institutions":[{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Peng Gao","raw_affiliation_strings":["Hong Kong Baptist University,Dept. Computer Science,Hong Kong, SAR"],"affiliations":[{"raw_affiliation_string":"Hong Kong Baptist University,Dept. Computer Science,Hong Kong, SAR","institution_ids":["https://openalex.org/I141568987"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059312754","display_name":"Zailong Chen","orcid":"https://orcid.org/0009-0003-8431-5471"},"institutions":[{"id":"https://openalex.org/I204824540","display_name":"University of Wollongong","ror":"https://ror.org/00jtmb277","country_code":"AU","type":"education","lineage":["https://openalex.org/I204824540"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Zailong Chen","raw_affiliation_strings":["University of Wollongong,Dept. Computer Science,Wollongong,Australia"],"affiliations":[{"raw_affiliation_string":"University of Wollongong,Dept. Computer Science,Wollongong,Australia","institution_ids":["https://openalex.org/I204824540"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056455910","display_name":"Wentao Fan","orcid":"https://orcid.org/0000-0001-6694-7289"},"institutions":[{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]},{"id":"https://openalex.org/I12615008","display_name":"Beijing Normal University - Hong Kong Baptist University United International College","ror":"https://ror.org/04snvc712","country_code":"CN","type":"education","lineage":["https://openalex.org/I12615008"]}],"countries":["CN","HK"],"is_corresponding":false,"raw_author_name":"Wentao Fan","raw_affiliation_strings":["Beijing Normal-Hong Kong Baptist University,Guangdong Provincial/Zhuhai Key Laboratory of IRADS,Zhuhai,China"],"affiliations":[{"raw_affiliation_string":"Beijing Normal-Hong Kong Baptist University,Guangdong Provincial/Zhuhai Key Laboratory of IRADS,Zhuhai,China","institution_ids":["https://openalex.org/I12615008","https://openalex.org/I141568987"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111209168","display_name":"Guquan Jing","orcid":null},"institutions":[{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Guquan Jing","raw_affiliation_strings":["Hong Kong Baptist University,Dept. Computer Science,Hong Kong, SAR"],"affiliations":[{"raw_affiliation_string":"Hong Kong Baptist University,Dept. Computer Science,Hong Kong, SAR","institution_ids":["https://openalex.org/I141568987"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5109637442","display_name":"Yiyang Hu","orcid":"https://orcid.org/0000-0003-3478-3616"},"institutions":[{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yiyang Hu","raw_affiliation_strings":["Hong Kong Baptist University,Dept. Computer Science,Hong Kong, SAR"],"affiliations":[{"raw_affiliation_string":"Hong Kong Baptist University,Dept. Computer Science,Hong Kong, SAR","institution_ids":["https://openalex.org/I141568987"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5109692762"],"corresponding_institution_ids":["https://openalex.org/I12615008","https://openalex.org/I141568987"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.42450008,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.5045999884605408,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.5045999884605408,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.32170000672340393,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0357000008225441,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.7085999846458435},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.635699987411499},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.45559999346733093},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.44999998807907104},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4072999954223633},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3767000138759613},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3546999990940094},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.3400999903678894}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8039000034332275},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.7085999846458435},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6934000253677368},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.635699987411499},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.47679999470710754},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.45559999346733093},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.44999998807907104},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4072999954223633},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3767000138759613},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.3400999903678894},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3249000012874603},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.3237999975681305},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.29580000042915344},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29089999198913574},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2874999940395355},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.28349998593330383},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2800000011920929},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C65885262","wikidata":"https://www.wikidata.org/wiki/Q7429708","display_name":"Scale-space segmentation","level":4,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11209970","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209970","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W2105767494","https://openalex.org/W2593116425","https://openalex.org/W2619383789","https://openalex.org/W2963115079","https://openalex.org/W2963925437","https://openalex.org/W3138516171","https://openalex.org/W4312815172","https://openalex.org/W4313123347","https://openalex.org/W4385767885","https://openalex.org/W4386076140","https://openalex.org/W4387969495","https://openalex.org/W4393148237","https://openalex.org/W4393159433","https://openalex.org/W4393160420","https://openalex.org/W4393178550","https://openalex.org/W4394625876","https://openalex.org/W4403386295","https://openalex.org/W4404531452","https://openalex.org/W4404970580"],"related_works":[],"abstract_inverted_index":{"The":[0],"Audio-Visual":[1],"Segmentation":[2],"(AVS)":[3],"task":[4],"aims":[5],"to":[6,33,47,132],"identify":[7],"sound-producing":[8],"objects":[9],"in":[10],"the":[11,125,134,138,145,153,163],"visual":[12,65,104,107,110],"domain":[13,135],"using":[14],"auditory":[15],"cues.":[16],"Enhancing":[17],"segmentation":[18],"efficiency":[19],"by":[20],"incorporating":[21],"prior":[22],"knowledge,":[23],"such":[24],"as":[25],"object":[26],"locations":[27],"and":[28,50,67,92,105,108,111,151],"textual":[29,68],"prompts,":[30],"has":[31],"proven":[32],"be":[34],"crucial.":[35],"However,":[36],"existing":[37],"methods":[38],"suffer":[39],"from":[40,73],"feature":[41],"misalignment":[42],"during":[43],"model":[44,79],"training,":[45],"leading":[46],"ineffective":[48],"integration":[49],"reduced":[51],"performance.":[52,156],"To":[53,122],"address":[54],"this,":[55],"we":[56,81,127],"propose":[57],"Triple-modalities":[58],"alignment":[59,119],"(TM-align),":[60],"which":[61],"combines":[62],"audio":[63],"signals,":[64],"images,":[66],"prompts.":[69],"By":[70,142],"leveraging":[71],"prompts":[72],"a":[74],"frozen":[75],"multi-modal":[76],"large":[77],"language":[78],"(MLLM),":[80],"extract":[82],"two":[83,114,140],"types":[84],"of":[85,101,115,137],"semantic":[86,89],"information:":[87],"contextual":[88],"description":[90],"(C.S.D)":[91],"prompt":[93],"specific":[94],"summary":[95],"(P.S.S).":[96],"TM-align":[97,148,161],"yields":[98],"three":[99,146],"pairs":[100],"aligned":[102],"features:":[103],"C.S.D,":[106],"P.S.S,":[109],"audio,":[112],"within":[113],"our":[116],"proposed":[117],"cross-modalities":[118],"(CMA)":[120],"models.":[121,166],"further":[123],"enhance":[124],"alignment,":[126],"employ":[128],"Jensen-Shannon":[129],"Divergence":[130],"(JSD)":[131],"regulate":[133],"distribution":[136],"latter":[139],"features.":[141],"effectively":[143],"aligning":[144],"modalities,":[147],"reduces":[149],"redundancy":[150],"improves":[152],"overall":[154],"AVS":[155,165],"Experimental":[157],"results":[158],"demonstrate":[159],"that":[160],"outperforms":[162],"mainstream":[164],"<sup":[167],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[168],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[169]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-30T00:00:00"}
