{"id":"https://openalex.org/W4415708688","doi":"https://doi.org/10.1109/icme59968.2025.11210146","title":"Incorporating Audio-Guided Visual Attention into Sound Event Localization and Detection with Source Distance Estimation","display_name":"Incorporating Audio-Guided Visual Attention into Sound Event Localization and Detection with Source Distance Estimation","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708688","doi":"https://doi.org/10.1109/icme59968.2025.11210146"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11210146","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210146","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100434900","display_name":"Qing Wang","orcid":"https://orcid.org/0000-0003-3843-3920"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qing Wang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103586766","display_name":"Jun Du","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Du","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102560553","display_name":"Hengyi Hong","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hengyi Hong","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101385418","display_name":"Maocheng Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maocheng Hu","raw_affiliation_strings":["National Intelligent Voice Innovation Center,Hefei,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Intelligent Voice Innovation Center,Hefei,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5105507458","display_name":"Mingqi Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mingqi Cai","raw_affiliation_strings":["iFlytek Research,Hefei,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iFlytek Research,Hefei,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101716884","display_name":"Xin Fang","orcid":"https://orcid.org/0000-0003-4796-9444"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin Fang","raw_affiliation_strings":["iFlytek Research,Hefei,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iFlytek Research,Hefei,China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34800182,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8532000184059143,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8532000184059143,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.1290999948978424,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.002300000051036477,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7235000133514404},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.70169997215271},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5788999795913696},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.541700005531311},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.51910001039505},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.49070000648498535},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.43700000643730164},{"id":"https://openalex.org/keywords/saliency-map","display_name":"Saliency map","score":0.4293000102043152}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7558000087738037},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7235000133514404},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.70169997215271},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5927000045776367},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5788999795913696},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.541700005531311},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.51910001039505},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5170000195503235},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.49070000648498535},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.44350001215934753},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.43700000643730164},{"id":"https://openalex.org/C2779679900","wikidata":"https://www.wikidata.org/wiki/Q25304431","display_name":"Saliency map","level":3,"score":0.4293000102043152},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.4124999940395355},{"id":"https://openalex.org/C93240960","wikidata":"https://www.wikidata.org/wiki/Q217270","display_name":"Acoustic source localization","level":3,"score":0.39489999413490295},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.37369999289512634},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34779998660087585},{"id":"https://openalex.org/C68236139","wikidata":"https://www.wikidata.org/wiki/Q765652","display_name":"Sound localization","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2922999858856201},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.2671999931335449}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11210146","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210146","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W2810934215","https://openalex.org/W2942551338","https://openalex.org/W2963115079","https://openalex.org/W2964109005","https://openalex.org/W3163193264","https://openalex.org/W3170088426","https://openalex.org/W3203177955","https://openalex.org/W3206329344","https://openalex.org/W4312367758","https://openalex.org/W4324116353","https://openalex.org/W4372263497","https://openalex.org/W4375868779","https://openalex.org/W4380032323","https://openalex.org/W4390096820","https://openalex.org/W4392903703","https://openalex.org/W4392904168","https://openalex.org/W4392904420","https://openalex.org/W4393241166","https://openalex.org/W4402112272","https://openalex.org/W4402979486","https://openalex.org/W4404577061","https://openalex.org/W4408352510"],"related_works":[],"abstract_inverted_index":{"Sound":[0],"event":[1,23,40],"localization":[2,41],"and":[3,12,26,42,73,88,103],"detection":[4,24,43],"(SELD)":[5],"is":[6],"a":[7,17,79],"task":[8,35,84],"that":[9,85,94],"involves":[10],"identifying":[11],"locating":[13],"sound":[14,22,39,138],"events":[15],"in":[16,136],"given":[18],"environment,":[19],"which":[20],"combines":[21],"(SED)":[25],"direction-of-arrival":[27],"(DOA)":[28],"estimation.":[29,90],"This":[30],"study":[31],"addresses":[32],"the":[33,100,107,114,118,123,131,145],"extended":[34],"of":[36,106,117,133],"audio-visual":[37],"(AV)":[38],"with":[44],"source":[45,80],"distance":[46,89],"estimation":[47,82],"(3D":[48],"SELD).":[49],"To":[50],"leverage":[51],"effective":[52],"visual":[53,59,74,141],"information,":[54],"we":[55,77],"propose":[56],"an":[57],"audio-guided":[58],"attention":[60],"mechanism":[61],"to":[62,70],"extract":[63],"location-based":[64],"features.":[65,75],"We":[66],"use":[67],"two":[68],"methods":[69],"fuse":[71],"audio":[72,134],"Additionally,":[76],"introduce":[78],"coordinate":[81],"(SCE)":[83],"integrates":[86],"DOA":[87],"Experimental":[91],"results":[92],"demonstrate":[93],"our":[95],"proposed":[96],"model":[97],"significantly":[98],"outperforms":[99],"official":[101],"audio-only":[102],"AV":[104],"baselines":[105],"DCASE":[108],"2024":[109],"Challenge":[110],"Task":[111],"3":[112],"on":[113],"development":[115],"set":[116],"STARSS23":[119],"dataset,":[120],"even":[121],"surpassing":[122],"challenge\u2019s":[124],"winning":[125],"method.":[126],"Attention":[127],"visualization":[128],"further":[129],"highlights":[130],"effectiveness":[132],"information":[135],"localizing":[137],"sources":[139],"within":[140],"images,":[142],"ultimately":[143],"enhancing":[144],"3D":[146],"SELD":[147],"performance.":[148],"Codes":[149],"are":[150],"available":[151],"at":[152],"https://github.com/qingwang24/AGVA-3DSELD/.":[153]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-30T00:00:00"}
