{"id":"https://openalex.org/W3015597694","doi":"https://doi.org/10.1109/icassp40776.2020.9053427","title":"Weakly Labelled Audio Tagging Via Convolutional Networks with Spatial and Channel-Wise Attention","display_name":"Weakly Labelled Audio Tagging Via Convolutional Networks with Spatial and Channel-Wise Attention","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W3015597694","doi":"https://doi.org/10.1109/icassp40776.2020.9053427","mag":"3015597694"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9053427","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053427","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083633440","display_name":"Sixin Hong","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Sixin Hong","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002795838","display_name":"Yuexian Zou","orcid":"https://orcid.org/0000-0001-9999-6140"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuexian Zou","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["Center for Vision, Speech and Signal Processing, University of Surrey, UK"],"affiliations":[{"raw_affiliation_string":"Center for Vision, Speech and Signal Processing, University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067002890","display_name":"Meng Cao","orcid":"https://orcid.org/0000-0002-8946-4228"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Cao","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5083633440"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":1.825,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.85558814,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"296","last_page":"300"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9743000268936157,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9520000219345093,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8016188740730286},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.7529605627059937},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.655268132686615},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.5863984227180481},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5492439866065979},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5368054509162903},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.46704453229904175},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.41326141357421875},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.408816933631897},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09847059845924377},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08368697762489319}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8016188740730286},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.7529605627059937},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.655268132686615},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.5863984227180481},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5492439866065979},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5368054509162903},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.46704453229904175},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.41326141357421875},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.408816933631897},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09847059845924377},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08368697762489319},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/icassp40776.2020.9053427","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053427","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:alma.44SUR_INST:11139661710002346","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4210197018","display_name":"View","issn_l":"2688-268X","issn":["2688-268X","2688-3988"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320595","host_organization_name":"Wiley","host_organization_lineage":["https://openalex.org/P4310320595"],"host_organization_lineage_names":["Wiley"],"type":"journal"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""},{"id":"pmh:oai:epubs.surrey.ac.uk:853802","is_oa":false,"landing_page_url":"http://epubs.surrey.ac.uk/853802/1/HongZWC_ICASSP_2020.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400680","display_name":"Surrey Research Insight Open Access (The University of Surrey)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28290843","host_organization_name":"University of Surrey","host_organization_lineage":["https://openalex.org/I28290843"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference or Workshop Item"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1836465849","https://openalex.org/W2062388453","https://openalex.org/W2354870669","https://openalex.org/W2414894569","https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2604490051","https://openalex.org/W2735263571","https://openalex.org/W2752782242","https://openalex.org/W2775505379","https://openalex.org/W2791956393","https://openalex.org/W2798579254","https://openalex.org/W2883651221","https://openalex.org/W2885392124","https://openalex.org/W2889351461","https://openalex.org/W2937077415","https://openalex.org/W2949117887","https://openalex.org/W2963022469","https://openalex.org/W2963100687","https://openalex.org/W2963420686","https://openalex.org/W2963610932","https://openalex.org/W2963715927","https://openalex.org/W2964891022","https://openalex.org/W3103314642","https://openalex.org/W3123416659","https://openalex.org/W4297686742","https://openalex.org/W6638667902","https://openalex.org/W6715395060","https://openalex.org/W6741024066","https://openalex.org/W6749158954","https://openalex.org/W6749977921","https://openalex.org/W6750883802","https://openalex.org/W6754161993"],"related_works":["https://openalex.org/W3093612317","https://openalex.org/W2175746458","https://openalex.org/W2732542196","https://openalex.org/W2760085659","https://openalex.org/W2883200793","https://openalex.org/W2738221750","https://openalex.org/W3012978760","https://openalex.org/W2912288872","https://openalex.org/W2940661641","https://openalex.org/W2758063741"],"abstract_inverted_index":{"Multiple":[0],"instance":[1],"learning":[2],"(MIL)":[3],"with":[4,67,114,134],"convolutional":[5],"neural":[6],"networks":[7],"(CNNs)":[8],"has":[9],"been":[10],"proposed":[11,105,132],"recently":[12],"for":[13],"weakly":[14,125],"labelled":[15,126],"audio":[16],"tagging.":[17],"However,":[18],"features":[19],"from":[20],"the":[21,68,72,77,81,86,89,131],"various":[22],"CNN":[23,92],"filtering":[24],"channels":[25],"and":[26,52,64,76,83,117],"spatial":[27,51,57,74],"regions":[28,75],"are":[29],"often":[30],"treated":[31],"equally,":[32],"which":[33],"may":[34],"limit":[35],"its":[36],"performance":[37],"in":[38,91],"event":[39],"prediction.":[40],"In":[41],"this":[42],"paper,":[43],"we":[44,59],"propose":[45],"a":[46,137],"novel":[47],"attention":[48,54,95],"mechanism,":[49],"namely,":[50],"channel-wise":[53,94],"(SCA).":[55],"For":[56],"attention,":[58],"divide":[60],"it":[61],"into":[62,110],"global":[63],"local":[65],"submodules":[66],"former":[69],"to":[70,79,99],"capture":[71],"event-related":[73],"latter":[78],"estimate":[80],"onset":[82],"offset":[84],"of":[85,143],"events.":[87],"Considering":[88],"variations":[90],"channels,":[93],"is":[96,118],"also":[97],"exploited":[98],"recognize":[100],"different":[101],"sound":[102],"scenes.":[103],"The":[104],"SCA":[106,133],"can":[107],"be":[108],"employed":[109],"any":[111],"CNNs":[112,135],"seamlessly":[113],"affordable":[115],"overheads":[116],"end-to-end":[119],"trainable":[120],"fashion.":[121],"Extensive":[122],"experiments":[123],"on":[124],"dataset":[127],"Audioset":[128],"show":[129],"that":[130],"achieves":[136],"state-of-the-art":[138],"mean":[139],"average":[140],"precision":[141],"(mAP)":[142],"0.390.":[144]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
