{"id":"https://openalex.org/W4415112498","doi":"https://doi.org/10.48550/arxiv.2506.12785","title":"Frequency Dynamic Convolutions for Sound Event Detection","display_name":"Frequency Dynamic Convolutions for Sound Event Detection","publication_year":2025,"publication_date":"2025-06-15","ids":{"openalex":"https://openalex.org/W4415112498","doi":"https://doi.org/10.48550/arxiv.2506.12785"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2506.12785","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.12785","pdf_url":"https://arxiv.org/pdf/2506.12785","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.12785","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068022654","display_name":"Hyeonuk Nam","orcid":"https://orcid.org/0000-0002-1169-5640"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Nam, Hyeonuk","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5068022654"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9847000241279602,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.6811000108718872},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.6603000164031982},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5501999855041504},{"id":"https://openalex.org/keywords/basis","display_name":"Basis (linear algebra)","score":0.4952999949455261},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.4607999920845032},{"id":"https://openalex.org/keywords/time\u2013frequency-analysis","display_name":"Time\u2013frequency analysis","score":0.4059999883174896},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3741999864578247},{"id":"https://openalex.org/keywords/frequency-response","display_name":"Frequency response","score":0.37130001187324524}],"concepts":[{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.6811000108718872},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.6603000164031982},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5501999855041504},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5267000198364258},{"id":"https://openalex.org/C12426560","wikidata":"https://www.wikidata.org/wiki/Q189569","display_name":"Basis (linear algebra)","level":2,"score":0.4952999949455261},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.4607999920845032},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4422000050544739},{"id":"https://openalex.org/C142433447","wikidata":"https://www.wikidata.org/wiki/Q7806653","display_name":"Time\u2013frequency analysis","level":3,"score":0.4059999883174896},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3741999864578247},{"id":"https://openalex.org/C8590192","wikidata":"https://www.wikidata.org/wiki/Q1054694","display_name":"Frequency response","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C5917680","wikidata":"https://www.wikidata.org/wiki/Q2621825","display_name":"Basis function","level":2,"score":0.34139999747276306},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C70136482","wikidata":"https://www.wikidata.org/wiki/Q13583781","display_name":"A-weighting","level":3,"score":0.3368000090122223},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3312999904155731},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32330000400543213},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3174999952316284},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3068000078201294},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.29249998927116394},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.263700008392334},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.25760000944137573},{"id":"https://openalex.org/C2780757906","wikidata":"https://www.wikidata.org/wiki/Q5276676","display_name":"Dilation (metric space)","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2506.12785","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.12785","pdf_url":"https://arxiv.org/pdf/2506.12785","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.12785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.12785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.12785","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.12785","pdf_url":"https://arxiv.org/pdf/2506.12785","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"research":[1],"in":[2,122,470,485,508,512,534],"deep":[3,535],"learning-based":[4,536],"Sound":[5],"Event":[6],"Detection":[7],"(SED)":[8],"has":[9,120],"primarily":[10],"focused":[11],"on":[12,65,91,106,384,497],"Convolutional":[13],"Recurrent":[14],"Neural":[15],"Networks":[16],"(CRNNs)":[17],"and":[18,33,192,395,454,522],"Transformer":[19],"models.":[20,170],"However,":[21,117],"conventional":[22,531],"2D":[23,248,532],"convolution-based":[24],"models":[25,507],"assume":[26],"shift":[27],"invariance":[28],"along":[29,188,354],"both":[30],"the":[31,66,70,107,114,129,144,185,189,208,216,271,275,300,329,333,355,406,414,458,475],"temporal":[32],"frequency":[34,67,82,155,190],"axes,":[35],"leadin":[36],"to":[37,73,102,113,138,151,183,253,282,351,361,530],"inconsistencies":[38],"when":[39],"dealing":[40],"with":[41,179,230,250,311,427,450],"frequency-dependent":[42,320],"characteristics":[43],"of":[44,69,128,167,220,277,298,416,440,460],"acoustic":[45],"signals.":[46],"To":[47,157,366],"address":[48],"this":[49,51,161,369],"issue,":[50],"study":[52,162,370],"proposes":[53,371],"Frequency":[54],"Dynamic":[55],"Convolution":[56],"(FDY":[57],"conv),":[58,375],"which":[59,223,347,376],"dynamically":[60],"adjusts":[61],"convolutional":[62,177,309],"kernels":[63,89,127,178,252,310],"based":[64,90],"composition":[68],"input":[71],"signal":[72,483],"enhance":[74,193],"SED":[75],"performance.":[76,259],"FDY":[77,100,118,168,172,211,221,234,283,286,341,436],"conv":[78,101,119,169,173,202,212,235,245,265,287,292,316,327,342,373,404,411,437,444,456,465,479,491,504],"constructs":[79],"an":[80,164],"optimal":[81],"response":[83],"by":[84,110,205,268,279,293,337,418],"adaptively":[85],"weighting":[86],"multiple":[87,308],"basis":[88,126,146],"frequency-specific":[92,141,194],"attention":[93],"weights.":[94],"Experimental":[95,197,260,322],"results":[96,198,224,261,323,517],"show":[97,199],"that":[98,123,200,263,325,382,391,399,435,519],"applying":[99,299],"CRNNs":[103],"improves":[104,203,266,438],"performance":[105,204,267,336,408,432,469],"DESED":[108],"dataset":[109],"7.56%":[111],"compared":[112,281],"baseline":[115,272,334],"CRNN.":[116],"limitations":[121],"it":[124],"combines":[125],"same":[130,301,407],"shape":[131],"across":[132,303],"all":[133,227,304,352],"frequencies,":[134],"restricting":[135],"its":[136,295,359],"ability":[137,360],"capture":[139,152,363],"diverse":[140,319],"characteristics.":[142],"Additionally,":[143,463],"$3\\times3$":[145],"kernel":[147],"size":[148],"is":[149,445],"insufficient":[150],"a":[153,527],"broader":[154],"range.":[156],"overcome":[158,367],"these":[159],"limitations,":[160],"introduces":[163],"extended":[165,524],"family":[166],"Dilated":[171],"(DFD":[174],"conv)":[175,214,289],"applies":[176],"various":[180],"dilation":[181,302,313],"rates":[182],"expand":[184],"receptive":[186],"field":[187],"axis":[191],"feature":[195],"representation.":[196],"DFD":[201,291,443,490],"9.27%":[206],"over":[207,270],"baseline.":[209],"Partial":[210],"(PFD":[213],"addresses":[215],"high":[217],"computational":[218,255,429],"cost":[219],"conv,":[222],"from":[225],"performing":[226],"convolution":[228],"operations":[229],"dynamic":[231],"kernels.":[232],"Since":[233],"may":[236],"introduce":[237],"unnecessary":[238],"adaptivity":[239],"for":[240,448],"quasi-stationary":[241,461],"sound":[242],"events,":[243,442],"PFD":[244,264,455,478],"integrates":[246,377],"standard":[247,340],"convolutions":[249,521,533],"frequency-adaptive":[251,520],"reduce":[254],"complexity":[256],"while":[257,273,502],"maintaining":[258],"demonstrate":[262],"7.80%":[269],"reducing":[274],"number":[276,415],"parameters":[278,417],"54.4%":[280],"conv.":[284],"Multi-Dilated":[285],"(MDFD":[288],"extends":[290],"addressing":[294],"structural":[296],"limitation":[297],"frequencies.":[305],"By":[306],"utilizing":[307],"different":[312],"rates,":[314],"MDFD":[315,326,410],"effectively":[317,362,480],"captures":[318,400,481],"patterns.":[321],"indicate":[324],"achieves":[328,405],"highest":[330],"performance,":[331],"improving":[332],"CRNN":[335],"10.98%.":[338],"Furthermore,":[339],"employs":[343],"Temporal":[344,378],"Average":[345,396],"Pooling,":[346],"assigns":[348],"equal":[349],"weight":[350],"frames":[353],"time":[356],"axis,":[357],"limiting":[358],"transient":[364,393,472,510],"events.":[365,462,473],"this,":[368],"TAP-FDY":[372,403],"(TFD":[374],"Attention":[379,388],"Pooling":[380,389,397],"(TA)":[381],"focuses":[383],"salient":[385],"features,":[386,453],"Velocity":[387],"(VA)":[390],"emphasizes":[392],"characteristics,":[394],"(AP)":[398],"stationary":[401],"properties.":[402],"as":[409],"but":[412],"reduces":[413],"approximately":[419],"30.01%":[420],"(12.703M":[421],"vs.":[422],"18.157M),":[423],"achieving":[424],"equivalent":[425],"accuracy":[426],"lower":[428],"complexity.":[430],"Class-wise":[431],"analysis":[433],"reveals":[434],"detection":[439,459],"non-stationary":[441],"particularly":[446],"effective":[447],"events":[449],"broad":[451],"spectral":[452,495],"enhances":[457],"TFD":[464,503],"(TFD-CRNN)":[466],"demonstrates":[467],"strong":[468],"detecting":[471,509],"In":[474],"case":[476],"studies,":[477],"stable":[482],"patterns":[484,496],"tank":[486],"powertrain":[487],"fault":[488,500],"recognition,":[489,501],"recognizes":[492],"wide":[493],"harmonic":[494],"speed-varying":[498],"motor":[499],"outperforms":[505],"other":[506],"signals":[511],"offshore":[513],"arc":[514],"detection.":[515],"These":[516],"suggest":[518],"their":[523],"variants":[525],"provide":[526],"robust":[528],"alternative":[529],"audio":[537],"processing.":[538]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-14T00:00:00"}
