{"id":"https://openalex.org/W4372263379","doi":"https://doi.org/10.1109/icassp49357.2023.10096853","title":"AST-SED: An Effective Sound Event Detection Method Based on Audio Spectrogram Transformer","display_name":"AST-SED: An Effective Sound Event Detection Method Based on Audio Spectrogram Transformer","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372263379","doi":"https://doi.org/10.1109/icassp49357.2023.10096853"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096853","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096853","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://figshare.com/articles/conference_contribution/AST-SED_An_Effective_Sound_Event_Detection_Method_Based_on_Audio_Spectrogram_Transformer/24217350","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100386630","display_name":"Shuo Li","orcid":"https://orcid.org/0000-0002-5184-3230"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Kang Li","raw_affiliation_strings":["University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100381758","display_name":"Yan Song","orcid":"https://orcid.org/0000-0002-5668-9068"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Song","raw_affiliation_strings":["University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057227915","display_name":"Li-Rong Dai","orcid":"https://orcid.org/0000-0002-0859-2827"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li-Rong Dai","raw_affiliation_strings":["University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000620878","display_name":"Ian McLoughlin","orcid":"https://orcid.org/0000-0001-7111-2008"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]},{"id":"https://openalex.org/I168639165","display_name":"Singapore Institute of Technology","ror":"https://ror.org/01v2c2791","country_code":"SG","type":"education","lineage":["https://openalex.org/I168639165"]}],"countries":["CN","SG"],"is_corresponding":false,"raw_author_name":"Ian McLoughlin","raw_affiliation_strings":["University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China","ICT Cluster, Singapore Institute of Technology, Singapore"],"affiliations":[{"raw_affiliation_string":"University Of Science And Technology Of China,National Engineering Research Centre of Speech and Language Information Processing,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Centre of Speech and Language Information Processing, University Of Science And Technology Of China, Hefei, China","institution_ids":[]},{"raw_affiliation_string":"ICT Cluster, Singapore Institute of Technology, Singapore","institution_ids":["https://openalex.org/I168639165"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101814613","display_name":"Xin Fang","orcid":"https://orcid.org/0009-0001-6003-9439"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin Fang","raw_affiliation_strings":["iFLYTEK Co. Ltd,iFLYTEK Research,Hefei,China","iFLYTEK Research, iFLYTEK Co. Ltd, Hefei, China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Co. Ltd,iFLYTEK Research,Hefei,China","institution_ids":[]},{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Co. Ltd, Hefei, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100383342","display_name":"Lin Liu","orcid":"https://orcid.org/0000-0003-2843-5738"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin Liu","raw_affiliation_strings":["iFLYTEK Co. Ltd,iFLYTEK Research,Hefei,China","iFLYTEK Research, iFLYTEK Co. Ltd, Hefei, China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Co. Ltd,iFLYTEK Research,Hefei,China","institution_ids":[]},{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Co. Ltd, Hefei, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100386630"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":4.8863,"has_fulltext":false,"cited_by_count":25,"citation_normalized_percentile":{"value":0.96166687,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.750536322593689},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7328392863273621},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7009485960006714},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6458755731582642},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.606979250907898},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40201446413993835},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3600956201553345},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.12196385860443115},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.08526363968849182}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.750536322593689},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7328392863273621},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7009485960006714},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6458755731582642},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.606979250907898},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40201446413993835},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3600956201553345},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.12196385860443115},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.08526363968849182},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096853","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096853","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:figshare.com:article/24217350","is_oa":true,"landing_page_url":"https://figshare.com/articles/conference_contribution/AST-SED_An_Effective_Sound_Event_Detection_Method_Based_on_Audio_Spectrogram_Transformer/24217350","pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"}],"best_oa_location":{"id":"pmh:oai:figshare.com:article/24217350","is_oa":true,"landing_page_url":"https://figshare.com/articles/conference_contribution/AST-SED_An_Effective_Sound_Event_Detection_Method_Based_on_Audio_Spectrogram_Transformer/24217350","pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2135342008","https://openalex.org/W2157331557","https://openalex.org/W2591013610","https://openalex.org/W2593116425","https://openalex.org/W2624241755","https://openalex.org/W2765407302","https://openalex.org/W2908510526","https://openalex.org/W2953070460","https://openalex.org/W2959539607","https://openalex.org/W2963099423","https://openalex.org/W2963610932","https://openalex.org/W3015190346","https://openalex.org/W3094502228","https://openalex.org/W3094550259","https://openalex.org/W3162400960","https://openalex.org/W3170874841","https://openalex.org/W3196645599","https://openalex.org/W3196974791","https://openalex.org/W3203468141","https://openalex.org/W3206996142","https://openalex.org/W4221149441","https://openalex.org/W4225706374","https://openalex.org/W6733814495","https://openalex.org/W6745136726","https://openalex.org/W6784333009","https://openalex.org/W6788135285"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W2011227383","https://openalex.org/W4375868962","https://openalex.org/W2088854863","https://openalex.org/W1976719989","https://openalex.org/W2942893872","https://openalex.org/W2065606036","https://openalex.org/W3179495260","https://openalex.org/W2897924318","https://openalex.org/W2138997758"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3],"propose":[4],"an":[5,82],"effective":[6,87],"sound":[7],"event":[8],"detection":[9,185],"(SED)":[10],"method":[11],"based":[12],"on":[13,21,39,159],"the":[14,22,60,78,97,101,113,152,166,169,176],"audio":[15,26,120],"spectrogram":[16],"transformer":[17],"(AST)":[18],"model,":[19],"pretrained":[20,74,153,196],"large-scale":[23],"AudioSet":[24],"for":[25,147],"tagging":[27],"(AT)":[28],"task,":[29],"termed":[30],"AST-SED.":[31],"Pretrained":[32],"AST":[33,75,98,154],"models":[34],"have":[35,164],"recently":[36],"shown":[37],"promise":[38],"DCASE2022":[40,160],"challenge":[41],"task4":[42,161],"where":[43],"they":[44],"help":[45],"mitigate":[46],"a":[47,73,124],"lack":[48],"of":[49,107,135,168,180,189],"sufficient":[50],"real":[51],"annotated":[52],"data.":[53],"However,":[54],"mainly":[55],"due":[56],"to":[57,68,85,93,116,145],"differences":[58],"between":[59],"AT":[61],"and":[62,88,139,182,194],"SED":[63],"tasks,":[64],"it":[65],"is":[66],"suboptimal":[67],"directly":[69],"utilize":[70],"outputs":[71],"from":[72],"model.":[76,99],"Hence":[77],"proposed":[79,170],"AST-SED":[80,171],"adopts":[81],"encoder-decoder":[83],"architecture":[84],"enable":[86],"efficient":[89],"fine-tuning":[90],"without":[91],"needing":[92],"redesign":[94],"or":[95],"retrain":[96],"Specifically,":[100,175],"Frequency-wise":[102],"Transformer":[103],"Encoder":[104],"(FTE)":[105],"consists":[106,134],"transformers":[108],"with":[109,172],"self":[110],"attention":[111],"along":[112],"frequency":[114],"axis":[115],"address":[117],"multiple":[118],"overlapped":[119],"events":[121],"issue":[122],"in":[123,151],"single":[125],"clip.":[126],"The":[127],"Local":[128],"Gated":[129,141],"Recurrent":[130,142],"Units":[131,143],"Decoder":[132],"(LGD)":[133],"nearest-neighbor":[136],"interpolation":[137],"(NNI)":[138],"Bidirectional":[140],"(Bi-GRU)":[144],"compensate":[146],"temporal":[148],"resolution":[149],"loss":[150],"model":[155],"output.":[156],"Experimental":[157],"results":[158],"development":[162],"set":[163],"demonstrated":[165],"superiority":[167],"FTE-LGD":[173],"architecture.":[174],"Event-Based":[177],"F1-score":[178],"(EB-F1)":[179],"59.60%":[181],"Polyphonic":[183],"Sound":[184],"Score":[186],"scenario1":[187],"(PSDS1)":[188],"0.5140":[190],"significantly":[191],"outperform":[192],"CRNN":[193],"other":[195],"AST-based":[197],"systems.":[198]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":11}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
