{"id":"https://openalex.org/W4401607728","doi":"https://doi.org/10.1109/taslp.2024.3444490","title":"Enhancing Conformer-Based Sound Event Detection Using Frequency Dynamic Convolutions and BEATs Audio Embeddings","display_name":"Enhancing Conformer-Based Sound Event Detection Using Frequency Dynamic Convolutions and BEATs Audio Embeddings","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4401607728","doi":"https://doi.org/10.1109/taslp.2024.3444490"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3444490","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3444490","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1109/taslp.2024.3444490","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106528898","display_name":"Sara Barahona","orcid":"https://orcid.org/0009-0001-8519-0549"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sara Barahona","raw_affiliation_strings":["AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027598609","display_name":"Diego de Benito-Gorr\u00f3n","orcid":"https://orcid.org/0000-0002-3725-2522"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diego de Benito-Gorr\u00f3n","raw_affiliation_strings":["AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089438058","display_name":"Doroteo T. Toledano","orcid":"https://orcid.org/0000-0003-1159-6455"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Doroteo T. Toledano","raw_affiliation_strings":["AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057509834","display_name":"Daniel Ramos","orcid":"https://orcid.org/0000-0001-5998-1489"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Ramos","raw_affiliation_strings":["AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"AUDIAS Research Group, Escuela Polit&#x00E9;cnica Superior, Universidad Aut&#x00F3;noma de Madrid, Madrid, Spain","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5106528898"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7077,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.84892963,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"32","issue":null,"first_page":"3896","last_page":"3907"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.5983066558837891},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5502352118492126},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.4832914173603058},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4490965008735657},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4289487898349762},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.3134163022041321},{"id":"https://openalex.org/keywords/astrophysics","display_name":"Astrophysics","score":0.07471463084220886}],"concepts":[{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.5983066558837891},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5502352118492126},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.4832914173603058},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4490965008735657},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4289487898349762},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.3134163022041321},{"id":"https://openalex.org/C44870925","wikidata":"https://www.wikidata.org/wiki/Q37547","display_name":"Astrophysics","level":1,"score":0.07471463084220886}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2024.3444490","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3444490","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:repositorio.uam.es:10486/714808","is_oa":true,"landing_page_url":"http://hdl.handle.net/10486/714808","pdf_url":"https://repositorio.uam.es/bitstream/10486/714808/1/enhancing_barahona_TASLP_2024.pdf","source":{"id":"https://openalex.org/S4306400963","display_name":"Biblos-e Archivo (Universidad Aut\u00f3noma de Madrid)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I63634437","host_organization_name":"Universidad Aut\u00f3noma de Madrid","host_organization_lineage":["https://openalex.org/I63634437"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1109/taslp.2024.3444490","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3444490","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1622782223","display_name":null,"funder_award_id":"MCIN/AEI/10","funder_id":"https://openalex.org/F4320335322","funder_display_name":"European Regional Development Fund"},{"id":"https://openalex.org/G2176187742","display_name":null,"funder_award_id":"PID2021-125943OB-I00","funder_id":"https://openalex.org/F4320335322","funder_display_name":"European Regional Development Fund"},{"id":"https://openalex.org/G2262748287","display_name":null,"funder_award_id":"501100011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G3429648993","display_name":null,"funder_award_id":"PID202","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G3480869486","display_name":null,"funder_award_id":"13039","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G4071968812","display_name":null,"funder_award_id":"MCIN/AEI/10.13039/501100011033/FEDER","funder_id":"https://openalex.org/F4320338080","funder_display_name":"European Social Fund"},{"id":"https://openalex.org/G4126322094","display_name":null,"funder_award_id":"01100011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G451917667","display_name":null,"funder_award_id":"13039/501100011033","funder_id":"https://openalex.org/F4320335322","funder_display_name":"European Regional Development Fund"},{"id":"https://openalex.org/G4816244515","display_name":null,"funder_award_id":"FEDER","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G586857820","display_name":null,"funder_award_id":"501100011033/FEDER","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G5967599077","display_name":null,"funder_award_id":"501100011033","funder_id":"https://openalex.org/F4320335322","funder_display_name":"European Regional Development Fund"},{"id":"https://openalex.org/G6071709581","display_name":null,"funder_award_id":"13039/501100011033/","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G661330594","display_name":null,"funder_award_id":"00110","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G6685425346","display_name":null,"funder_award_id":"0011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G7084143925","display_name":null,"funder_award_id":"AEI/10","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G7266728691","display_name":null,"funder_award_id":"13039/501100011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G7685797761","display_name":null,"funder_award_id":"PID2021-125943OB-I00","funder_id":"https://openalex.org/F4320338080","funder_display_name":"European Social Fund"},{"id":"https://openalex.org/G8260616629","display_name":null,"funder_award_id":"011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G8570012161","display_name":null,"funder_award_id":"unknown","funder_id":"https://openalex.org/F4320335322","funder_display_name":"European Regional Development Fund"}],"funders":[{"id":"https://openalex.org/F4320335322","display_name":"European Regional Development Fund","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320335598","display_name":"Agencia Estatal de Investigaci\u00f3n","ror":null},{"id":"https://openalex.org/F4320338080","display_name":"European Social Fund","ror":"https://ror.org/00k4n6c32"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W2052666245","https://openalex.org/W2086384421","https://openalex.org/W2112796928","https://openalex.org/W2219249508","https://openalex.org/W2408239454","https://openalex.org/W2591013610","https://openalex.org/W2593116425","https://openalex.org/W2771361008","https://openalex.org/W2896457183","https://openalex.org/W2948981900","https://openalex.org/W2959539607","https://openalex.org/W2964110616","https://openalex.org/W2981733351","https://openalex.org/W3015190346","https://openalex.org/W3082704281","https://openalex.org/W3087468906","https://openalex.org/W3097777922","https://openalex.org/W3169030202","https://openalex.org/W3180060682","https://openalex.org/W3203468141","https://openalex.org/W3206996142","https://openalex.org/W3207059654","https://openalex.org/W3209059054","https://openalex.org/W3211278025","https://openalex.org/W4205689591","https://openalex.org/W4221149441","https://openalex.org/W4221154745","https://openalex.org/W4224920041","https://openalex.org/W4297841853","https://openalex.org/W4318594283","https://openalex.org/W4327781337","https://openalex.org/W4372270119","https://openalex.org/W4385245566","https://openalex.org/W6688816777","https://openalex.org/W6733814495","https://openalex.org/W6745136726","https://openalex.org/W6763608318","https://openalex.org/W6779574021","https://openalex.org/W6784333009","https://openalex.org/W6796761347","https://openalex.org/W6798264668","https://openalex.org/W6810673746","https://openalex.org/W6848208918"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2935759653","https://openalex.org/W3105167352","https://openalex.org/W54078636","https://openalex.org/W2954470139","https://openalex.org/W1501425562","https://openalex.org/W2902782467","https://openalex.org/W3084825885","https://openalex.org/W2298861036","https://openalex.org/W2271181815"],"abstract_inverted_index":{"Over":[0],"the":[1,7,39,78,83,108,146,156,175,180,200,203],"last":[2],"few":[3],"years,":[4],"most":[5],"of":[6,59,155,177,202,214],"tasks":[8],"employing":[9,20],"Deep":[10],"Learning":[11],"techniques":[12],"for":[13,82,159],"audio":[14],"processing":[15],"have":[16],"achieved":[17,53],"state-of-the-art":[18],"results":[19,206],"Conformer-based":[21,51,104],"systems.":[22],"However,":[23],"when":[24],"it":[25,32,37],"comes":[26],"to":[27,64,101,106,129,144,150,170,187],"sound":[28,60],"event":[29],"detection":[30],"(SED),":[31],"was":[33],"scarcely":[34],"used":[35],"after":[36],"won":[38],"DCASE":[40],"Challenge":[41],"2020":[42],"Task":[43],"4.":[44],"In":[45],"previous":[46],"research,":[47],"we":[48,99,116,135,173],"found":[49],"that":[50,77],"systems":[52,220],"a":[54,103,118,152,211],"higher":[55],"performance":[56,109,147],"in":[57,96],"terms":[58],"events":[61],"classification":[62,132],"compared":[63],"other":[65],"architectures":[66],"frequently":[67],"employed,":[68],"such":[69],"as":[70],"Convolutional":[71],"Recurrent":[72],"Neural":[73],"Networks":[74],"(CRNNs).":[75],"Given":[76],"second":[79],"scenario":[80],"proposed":[81,124,139],"Polyphonic":[84],"Sound":[85],"Detection":[86],"Score":[87],"(PSDS2)":[88],"is":[89],"focused":[90],"on":[91,110,222],"avoiding":[92],"confusion":[93],"between":[94],"classes,":[95],"this":[97,111,114],"paper":[98],"propose":[100],"optimize":[102],"system":[105],"maximize":[107],"scenario.":[112],"For":[113],"purpose,":[115],"performed":[117],"hyperparameter":[119],"tuning":[120],"and":[121,164,166,216],"incorporated":[122],"recently":[123],"Frequency":[125],"Dynamic":[126],"Convolutions":[127],"(FDY)":[128],"enhance":[130,145],"its":[131,162],"properties.":[133],"Additionally,":[134,172],"employed":[136],"our":[137],"previously":[138],"multi-resolution":[140],"approach":[141],"not":[142],"only":[143],"but":[148],"also":[149],"gain":[151],"deeper":[153],"understanding":[154],"Conformer":[157,204],"architecture":[158],"SED,":[160],"analyzing":[161],"advantages":[163],"disadvantages,":[165],"finding":[167],"possible":[168],"solutions":[169],"them.":[171],"explored":[174],"integration":[176],"embeddings":[178,198],"from":[179,192],"pre-trained":[181],"model":[182],"BEATs,":[183],"an":[184],"iterative":[185],"framework":[186],"learn":[188],"Bidirectional":[189],"Encoder":[190],"representation":[191],"Audio":[193],"Transformers.":[194],"By":[195],"concatenating":[196],"these":[197],"into":[199],"input":[201],"blocks,":[205],"were":[207],"further":[208],"improved,":[209],"achieving":[210],"PSDS2":[212],"value":[213],"0.813":[215],"considerably":[217],"outperforming":[218],"SED":[219],"based":[221],"CRNNs.":[223]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
