{"id":"https://openalex.org/W4392903775","doi":"https://doi.org/10.1109/icassp48485.2024.10446516","title":"TransAVS: End-to-End Audio-Visual Segmentation with Transformer","display_name":"TransAVS: End-to-End Audio-Visual Segmentation with Transformer","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903775","doi":"https://doi.org/10.1109/icassp48485.2024.10446516"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446516","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446516","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100589668","display_name":"Yuhang Ling","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuhang Ling","raw_affiliation_strings":["Fudan University,School of Computer Science, Shanghai Key Laboratory of Data Science,China","School of Computer Science, Shanghai Key Laboratory of Data Science, Fudan University, China"],"affiliations":[{"raw_affiliation_string":"Fudan University,School of Computer Science, Shanghai Key Laboratory of Data Science,China","institution_ids":["https://openalex.org/I24943067"]},{"raw_affiliation_string":"School of Computer Science, Shanghai Key Laboratory of Data Science, Fudan University, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100674060","display_name":"Yuxi Li","orcid":"https://orcid.org/0000-0003-1556-598X"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxi Li","raw_affiliation_strings":["Tencent,Youtu Lab","Youtu Lab, Tencent"],"affiliations":[{"raw_affiliation_string":"Tencent,Youtu Lab","institution_ids":["https://openalex.org/I2250653659"]},{"raw_affiliation_string":"Youtu Lab, Tencent","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012172321","display_name":"Zhenye Gan","orcid":"https://orcid.org/0000-0003-1477-4958"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenye Gan","raw_affiliation_strings":["Tencent,Youtu Lab","Youtu Lab, Tencent"],"affiliations":[{"raw_affiliation_string":"Tencent,Youtu Lab","institution_ids":["https://openalex.org/I2250653659"]},{"raw_affiliation_string":"Youtu Lab, Tencent","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021861529","display_name":"Jiangning Zhang","orcid":"https://orcid.org/0000-0001-8891-6766"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiangning Zhang","raw_affiliation_strings":["Zhejiang University","Youtu Lab, Tencent"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]},{"raw_affiliation_string":"Youtu Lab, Tencent","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057267422","display_name":"Mingmin Chi","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingmin Chi","raw_affiliation_strings":["Fudan University,School of Computer Science, Shanghai Key Laboratory of Data Science,China","School of Computer Science, Shanghai Key Laboratory of Data Science, Fudan University, China","Zhongshan Fudan Joint Innovation Center, Zhongshan PoolNet Technology Co., Ltd, China"],"affiliations":[{"raw_affiliation_string":"Fudan University,School of Computer Science, Shanghai Key Laboratory of Data Science,China","institution_ids":["https://openalex.org/I24943067"]},{"raw_affiliation_string":"School of Computer Science, Shanghai Key Laboratory of Data Science, Fudan University, China","institution_ids":["https://openalex.org/I24943067"]},{"raw_affiliation_string":"Zhongshan Fudan Joint Innovation Center, Zhongshan PoolNet Technology Co., Ltd, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028731909","display_name":"Yabiao Wang","orcid":"https://orcid.org/0000-0002-6592-8411"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]},{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yabiao Wang","raw_affiliation_strings":["Zhejiang University","Youtu Lab, Tencent"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]},{"raw_affiliation_string":"Youtu Lab, Tencent","institution_ids":["https://openalex.org/I2250653659"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100589668"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02949728,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"7845","last_page":"7849"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8362932205200195},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.7085661888122559},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.655564546585083},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5722769498825073},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5376701354980469},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.48750588297843933},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.47763124108314514},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.44567301869392395},{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.4235053062438965},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.42060238122940063},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.2741689682006836},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.19449910521507263}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8362932205200195},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.7085661888122559},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.655564546585083},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5722769498825073},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5376701354980469},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.48750588297843933},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.47763124108314514},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.44567301869392395},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.4235053062438965},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.42060238122940063},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2741689682006836},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.19449910521507263},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446516","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446516","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2728175297","display_name":null,"funder_award_id":"62171139","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W2031489346","https://openalex.org/W2048553159","https://openalex.org/W2117539524","https://openalex.org/W2194775991","https://openalex.org/W2526050071","https://openalex.org/W2565639579","https://openalex.org/W2619697695","https://openalex.org/W2908510526","https://openalex.org/W2931433835","https://openalex.org/W2962914239","https://openalex.org/W2963115079","https://openalex.org/W2963351448","https://openalex.org/W2963680395","https://openalex.org/W2964109005","https://openalex.org/W2990113535","https://openalex.org/W3021321555","https://openalex.org/W3093287838","https://openalex.org/W3108367559","https://openalex.org/W3110606395","https://openalex.org/W3118120400","https://openalex.org/W3138516171","https://openalex.org/W3153906112","https://openalex.org/W3170088426","https://openalex.org/W3170630188","https://openalex.org/W3175335326","https://openalex.org/W3212022073","https://openalex.org/W3214328324","https://openalex.org/W4287684771","https://openalex.org/W4293665662","https://openalex.org/W4312864739","https://openalex.org/W4313123347","https://openalex.org/W4376226279","https://openalex.org/W4385245566","https://openalex.org/W4386497700","https://openalex.org/W4391547487","https://openalex.org/W6729831399","https://openalex.org/W6739901393","https://openalex.org/W6757817989","https://openalex.org/W6782657842","https://openalex.org/W6793746569","https://openalex.org/W6803674551","https://openalex.org/W6804185262"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W3179968364","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519"],"abstract_inverted_index":{"Audio-Visual":[0],"Segmentation":[1],"(AVS)":[2],"is":[3],"a":[4,31],"challenging":[5],"task,":[6],"which":[7,99],"aims":[8],"to":[9,57,65,72,132,137,145,163],"segment":[10],"sounding":[11,140],"objects":[12,42,141],"in":[13,128,191],"video":[14],"frames":[15],"by":[16,40],"exploring":[17],"audio":[18,48,60,94,97,135,169,196],"signals.":[19],"Generally":[20],"AVS":[21,88],"faces":[22],"two":[23,150],"key":[24],"challenges:":[25],"(1)":[26],"Audio":[27],"signals":[28],"inherently":[29],"exhibit":[30],"high":[32],"degree":[33],"of":[34,52,143],"information":[35],"density,":[36],"as":[37,96],"sounds":[38],"produced":[39],"multiple":[41],"are":[43],"entangled":[44],"within":[45,167],"the":[46,53,82,93,129,161,185,193],"same":[47,54],"stream;":[49],"(2)":[50],"Objects":[51],"category":[55],"tend":[56],"produce":[58],"similar":[59,168],"signals,":[61],"making":[62],"it":[63],"difficult":[64],"distinguish":[66],"between":[67,195],"them":[68],"and":[69,104,157,171,197],"thus":[70],"leading":[71],"unclear":[73],"segmentation":[74,107],"results.":[75],"Toward":[76],"this":[77],"end,":[78],"we":[79,148],"propose":[80],"TransAVS,":[81],"first":[83],"Transformer-based":[84],"end-to-end":[85],"framework":[86],"for":[87],"task.":[89],"Specifically,":[90],"TransAVS":[91,180],"disentangles":[92],"stream":[95],"queries,":[98],"will":[100],"interact":[101],"with":[102,109],"images":[103],"decode":[105],"into":[106],"masks":[108],"full":[110],"transformer":[111],"architectures.":[112],"This":[113],"scheme":[114],"not":[115],"only":[116],"promotes":[117],"comprehensive":[118],"audio-image":[119],"communication":[120],"but":[121],"also":[122],"explicitly":[123],"excavates":[124],"instance":[125],"cues":[126],"encapsulated":[127],"scene.":[130],"Meanwhile,":[131],"encourage":[133],"these":[134],"queries":[136],"capture":[138,164],"distinctive":[139,165],"instead":[142],"degrading":[144],"be":[146],"homogeneous,":[147],"devise":[149],"self-supervised":[151],"loss":[152],"functions":[153],"at":[154],"both":[155],"query":[156],"mask":[158],"levels,":[159],"allowing":[160],"model":[162],"features":[166],"data":[170],"achieve":[172],"more":[173],"precise":[174],"segmentation.":[175],"Our":[176],"experiments":[177],"demonstrate":[178],"that":[179],"achieves":[181],"state-of-the-art":[182],"results":[183],"on":[184],"AVSBench":[186],"dataset,":[187],"highlighting":[188],"its":[189],"effectiveness":[190],"bridging":[192],"gap":[194],"visual":[198],"modalities.":[199]},"counts_by_year":[],"updated_date":"2026-03-18T14:38:29.013473","created_date":"2025-10-10T00:00:00"}
