{"id":"https://openalex.org/W4387969151","doi":"https://doi.org/10.1145/3581783.3612373","title":"Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics","display_name":"Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387969151","doi":"https://doi.org/10.1145/3581783.3612373"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612373","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612373","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100322208","display_name":"Chen Liu","orcid":"https://orcid.org/0000-0003-3159-0034"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]},{"id":"https://openalex.org/I165143802","display_name":"The University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Chen Liu","raw_affiliation_strings":["The University of Queensland &amp; University of Technology Sydney, Brisbane, QLD, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland &amp; University of Technology Sydney, Brisbane, QLD, Australia","institution_ids":["https://openalex.org/I165143802","https://openalex.org/I114017466"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032941294","display_name":"Peike Li","orcid":"https://orcid.org/0000-0003-1809-2137"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peike Patrick Li","raw_affiliation_strings":["Matrix Verse, Sydney, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"Matrix Verse, Sydney, NSW, Australia","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026841385","display_name":"Xingqun Qi","orcid":"https://orcid.org/0000-0002-9772-5707"},"institutions":[{"id":"https://openalex.org/I4210091137","display_name":"NetEase (China)","ror":"https://ror.org/00fp6fj05","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091137"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingqun Qi","raw_affiliation_strings":["Netease Fuxi AI Lab, Zhejiang, China"],"affiliations":[{"raw_affiliation_string":"Netease Fuxi AI Lab, Zhejiang, China","institution_ids":["https://openalex.org/I4210091137"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115580345","display_name":"Hu Zhang","orcid":"https://orcid.org/0009-0009-9892-9515"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"The University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Hu Zhang","raw_affiliation_strings":["The University of Queensland, Brisbane, QLD, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Brisbane, QLD, Australia","institution_ids":["https://openalex.org/I165143802"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019036797","display_name":"Lincheng Li","orcid":"https://orcid.org/0000-0002-6047-0472"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lincheng Li","raw_affiliation_strings":["Netease Fuxi AI Lab, Zhejiang, Australia"],"affiliations":[{"raw_affiliation_string":"Netease Fuxi AI Lab, Zhejiang, Australia","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053719139","display_name":"Dadong Wang","orcid":"https://orcid.org/0000-0003-0409-2259"},"institutions":[{"id":"https://openalex.org/I1292875679","display_name":"Commonwealth Scientific and Industrial Research Organisation","ror":"https://ror.org/03qn8fb07","country_code":"AU","type":"government","lineage":["https://openalex.org/I1292875679","https://openalex.org/I2801453606","https://openalex.org/I4387156119"]},{"id":"https://openalex.org/I42894916","display_name":"Data61","ror":"https://ror.org/03q397159","country_code":"AU","type":"other","lineage":["https://openalex.org/I1292875679","https://openalex.org/I2801453606","https://openalex.org/I42894916","https://openalex.org/I4387156119"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Dadong Wang","raw_affiliation_strings":["CSIRO DATA61, Sydney, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"CSIRO DATA61, Sydney, NSW, Australia","institution_ids":["https://openalex.org/I42894916","https://openalex.org/I1292875679"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003076238","display_name":"Xin Yu","orcid":"https://orcid.org/0000-0002-0269-5649"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"The University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Xin Yu","raw_affiliation_strings":["The University of Queensland, Brisbane, QLD, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Brisbane, QLD, Australia","institution_ids":["https://openalex.org/I165143802"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100322208"],"corresponding_institution_ids":["https://openalex.org/I114017466","https://openalex.org/I165143802"],"apc_list":null,"apc_paid":null,"fwci":5.6902,"has_fulltext":false,"cited_by_count":29,"citation_normalized_percentile":{"value":0.97143034,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"7590","last_page":"7598"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7922247648239136},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6848927140235901},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6369906663894653},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5973342657089233},{"id":"https://openalex.org/keywords/depth-sounding","display_name":"Depth sounding","score":0.5861025452613831},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.5579509139060974},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5548897981643677},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4225214123725891},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.41248324513435364},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.1830105483531952},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.08810287714004517}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7922247648239136},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6848927140235901},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6369906663894653},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5973342657089233},{"id":"https://openalex.org/C55510283","wikidata":"https://www.wikidata.org/wiki/Q1382947","display_name":"Depth sounding","level":2,"score":0.5861025452613831},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.5579509139060974},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5548897981643677},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4225214123725891},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.41248324513435364},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.1830105483531952},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.08810287714004517},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3612373","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612373","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.41999998688697815}],"awards":[{"id":"https://openalex.org/G1426318481","display_name":null,"funder_award_id":"grant","funder_id":"https://openalex.org/F4320322725","funder_display_name":"China Scholarship Council"},{"id":"https://openalex.org/G2300736770","display_name":null,"funder_award_id":"(CSC)","funder_id":"https://openalex.org/F4320322725","funder_display_name":"China Scholarship Council"},{"id":"https://openalex.org/G2747171834","display_name":null,"funder_award_id":"50092128","funder_id":"https://openalex.org/F4320320386","funder_display_name":"Commonwealth Scientific and Industrial Research Organisation"},{"id":"https://openalex.org/G4132985236","display_name":null,"funder_award_id":"unknown","funder_id":"https://openalex.org/F4320322725","funder_display_name":"China Scholarship Council"},{"id":"https://openalex.org/G8589651859","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320322725","funder_display_name":"China Scholarship Council"}],"funders":[{"id":"https://openalex.org/F4320320386","display_name":"Commonwealth Scientific and Industrial Research Organisation","ror":"https://ror.org/03qn8fb07"},{"id":"https://openalex.org/F4320322725","display_name":"China Scholarship Council","ror":"https://ror.org/04atp4p48"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2412782625","https://openalex.org/W2485522583","https://openalex.org/W2560023338","https://openalex.org/W2593116425","https://openalex.org/W2737258237","https://openalex.org/W2962865004","https://openalex.org/W2963150697","https://openalex.org/W2963351448","https://openalex.org/W2963680395","https://openalex.org/W2964309882","https://openalex.org/W2999219213","https://openalex.org/W3008923235","https://openalex.org/W3092603041","https://openalex.org/W3105352633","https://openalex.org/W3108367559","https://openalex.org/W3138516171","https://openalex.org/W3144223439","https://openalex.org/W3170088426","https://openalex.org/W3180659539","https://openalex.org/W3206231676","https://openalex.org/W3213165621","https://openalex.org/W3217061668","https://openalex.org/W4212847156","https://openalex.org/W4213271552","https://openalex.org/W4214893857","https://openalex.org/W4221167476","https://openalex.org/W4224925617","https://openalex.org/W4247981280","https://openalex.org/W4282582789","https://openalex.org/W4283709432","https://openalex.org/W4287775237","https://openalex.org/W4309660795","https://openalex.org/W4312420092","https://openalex.org/W4312653918","https://openalex.org/W4312815172","https://openalex.org/W4313123347","https://openalex.org/W4319300768","https://openalex.org/W4319300806","https://openalex.org/W6600007113","https://openalex.org/W6600565697"],"related_works":["https://openalex.org/W2353179089","https://openalex.org/W2809612451","https://openalex.org/W2923538289","https://openalex.org/W1974079146","https://openalex.org/W2353125546","https://openalex.org/W4383874204","https://openalex.org/W2470643824","https://openalex.org/W2373976954","https://openalex.org/W2616411725","https://openalex.org/W2349635380"],"abstract_inverted_index":{"The":[0],"audio-visual":[1,92,199,249],"segmentation":[2,94,117,158,166,174],"(AVS)":[3],"task":[4],"aims":[5],"to":[6,28,41,76,82,96,176,196,216,239,247,270],"segment":[7,42,264],"sounding":[8,30,58,79,109,123,138,162,192,226,265],"objects":[9,59,80,110,163,266,272],"from":[10],"a":[11,25,43,48,102,112,137,144,171],"given":[12,26,128],"video.":[13,149],"Existing":[14],"works":[15],"mainly":[16],"focus":[17],"on":[18,254],"fusing":[19],"audio":[20,53,186,205,213],"and":[21,119,202,220,273,282],"visual":[22],"features":[23],"of":[24,51,185],"video":[27,49,113,142],"achieve":[29],"object":[31,46,116,124,134,139,157],"masks.":[32,167],"However,":[33],"we":[34,89,194,210,233,244],"observed":[35],"that":[36,132,259],"prior":[37],"arts":[38],"are":[39,60,245],"prone":[40],"certain":[44],"salient":[45,64,271],"in":[47,66,111,140,147,154,278],"regardless":[50],"the":[52,62,67,83,98,122,127,178,182,198,235,241,255,280],"information.":[54],"This":[55,150],"is":[56,187],"because":[57],"often":[61],"most":[63],"ones":[65],"AVS":[68,72,256],"dataset.":[69],"Thus,":[70],"current":[71],"methods":[73],"might":[74],"fail":[75],"localize":[77],"genuine":[78],"due":[81],"dataset":[84,99],"bias.":[85,100],"In":[86,101],"this":[87],"work,":[88],"present":[90],"an":[91,115,133],"instance-aware":[93],"approach":[95],"overcome":[97],"nutshell,":[103],"our":[104,156,260],"method":[105,261],"first":[106],"localizes":[107],"potential":[108,207,217],"by":[114],"network,":[118],"then":[120,203],"associates":[121],"candidates":[125],"with":[126,206],"audio.":[129],"We":[130,168],"notice":[131],"could":[135],"be":[136],"one":[141,146],"but":[143],"silent":[145,172],"another":[148],"would":[151],"bring":[152],"ambiguity":[153],"training":[155],"network":[159],"as":[160],"only":[161],"have":[164],"corresponding":[165,225],"thus":[169],"propose":[170,195],"object-aware":[173],"objective":[175],"alleviate":[177],"ambiguity.":[179],"Moreover,":[180],"since":[181],"category":[183,214],"information":[184],"unknown,":[188],"especially":[189],"for":[190],"multiple":[191],"sources,":[193],"explore":[197],"semantic":[200],"correlation":[201],"associate":[204],"objects.":[208],"Specifically,":[209],"attend":[211],"predicted":[212],"scores":[215,222],"instance":[218,237],"masks":[219,238],"these":[221],"will":[223],"highlight":[224],"instances":[227],"while":[228],"suppressing":[229],"inaudible":[230],"ones.":[231],"When":[232],"enforce":[234],"attended":[236],"resemble":[240],"ground-truth":[242],"mask,":[243],"able":[246],"establish":[248],"semantics":[250],"correlation.":[251],"Experimental":[252],"results":[253],"benchmarks":[257],"demonstrate":[258],"can":[262],"effectively":[263],"without":[267],"being":[268],"biased":[269],"also":[274],"achieves":[275],"state-of-the-art":[276],"performance":[277],"both":[279],"single-source":[281],"multi-source":[283],"scenarios.":[284]},"counts_by_year":[{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":20}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
