{"id":"https://openalex.org/W4372268280","doi":"https://doi.org/10.1109/icassp49357.2023.10096532","title":"Play It Back: Iterative Attention For Audio Recognition","display_name":"Play It Back: Iterative Attention For Audio Recognition","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372268280","doi":"https://doi.org/10.1109/icassp49357.2023.10096532"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096532","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096532","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://research-information.bris.ac.uk/en/publications/ae3902ae-bc3e-44df-a691-ecce43213f6a","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046369742","display_name":"Alexandros Stergiou","orcid":"https://orcid.org/0000-0003-4706-4231"},"institutions":[{"id":"https://openalex.org/I13469542","display_name":"Vrije Universiteit Brussel","ror":"https://ror.org/006e5kg04","country_code":"BE","type":"education","lineage":["https://openalex.org/I13469542"]},{"id":"https://openalex.org/I4210114974","display_name":"IMEC","ror":"https://ror.org/02kcbn207","country_code":"BE","type":"nonprofit","lineage":["https://openalex.org/I4210114974"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Alexandros Stergiou","raw_affiliation_strings":["Vrije University of Brussels,Belgium","Vrije University of Brussels, Belgium","Interuniversity Microelectronics Centre, Leuven, Belgium"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Vrije University of Brussels,Belgium","institution_ids":["https://openalex.org/I13469542"]},{"raw_affiliation_string":"Vrije University of Brussels, Belgium","institution_ids":["https://openalex.org/I13469542"]},{"raw_affiliation_string":"Interuniversity Microelectronics Centre, Leuven, Belgium","institution_ids":["https://openalex.org/I4210114974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003103666","display_name":"Dima Damen","orcid":"https://orcid.org/0000-0001-8804-6238"},"institutions":[{"id":"https://openalex.org/I36234482","display_name":"University of Bristol","ror":"https://ror.org/0524sp257","country_code":"GB","type":"education","lineage":["https://openalex.org/I36234482"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Dima Damen","raw_affiliation_strings":["University of Bristol,United Kingdom","University of Bristol, United Kingdom"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Bristol,United Kingdom","institution_ids":["https://openalex.org/I36234482"]},{"raw_affiliation_string":"University of Bristol, United Kingdom","institution_ids":["https://openalex.org/I36234482"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9238,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.73207874,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.8463090658187866},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7582231760025024},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6689801216125488},{"id":"https://openalex.org/keywords/repetition","display_name":"Repetition (rhetorical device)","score":0.5172389149665833},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.48812660574913025},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4483800530433655},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.43741488456726074},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4291216731071472},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.42466267943382263},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3705209791660309},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.34044602513313293},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.20159423351287842}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.8463090658187866},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7582231760025024},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6689801216125488},{"id":"https://openalex.org/C2776141515","wikidata":"https://www.wikidata.org/wiki/Q1274479","display_name":"Repetition (rhetorical device)","level":2,"score":0.5172389149665833},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.48812660574913025},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4483800530433655},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.43741488456726074},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4291216731071472},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.42466267943382263},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3705209791660309},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.34044602513313293},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.20159423351287842},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096532","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096532","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:research-information.bris.ac.uk:openaire/ae3902ae-bc3e-44df-a691-ecce43213f6a","is_oa":true,"landing_page_url":"https://research-information.bris.ac.uk/en/publications/ae3902ae-bc3e-44df-a691-ecce43213f6a","pdf_url":null,"source":{"id":"https://openalex.org/S4306400895","display_name":"Bristol Research (University of Bristol)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I36234482","host_organization_name":"University of Bristol","host_organization_lineage":["https://openalex.org/I36234482"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Stergiou, A G & Damen, D 2023, Play It Back : Iterative Attention For Audio Recognition. in ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Institute of Electrical and Electronics Engineers (IEEE). https://doi.org/10.1109/ICASSP49357.2023.10096532","raw_type":"contributionToPeriodical"},{"id":"pmh:oai:vubissmart:VUBISSMART:2000:168705","is_oa":false,"landing_page_url":"https://biblio.vub.ac.be/vubir/play-it-back-iterative-attention-for-audio-recognition(8527b74a-d925-49f4-9f08-0661cae5ed72).html","pdf_url":null,"source":{"id":"https://openalex.org/S4306402573","display_name":"VUBIR (Vrije Universiteit Brussel)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I13469542","host_organization_name":"Vrije Universiteit Brussel","host_organization_lineage":["https://openalex.org/I13469542"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:research-information.bris.ac.uk:openaire/ae3902ae-bc3e-44df-a691-ecce43213f6a","is_oa":true,"landing_page_url":"https://research-information.bris.ac.uk/en/publications/ae3902ae-bc3e-44df-a691-ecce43213f6a","pdf_url":null,"source":{"id":"https://openalex.org/S4306400895","display_name":"Bristol Research (University of Bristol)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I36234482","host_organization_name":"University of Bristol","host_organization_lineage":["https://openalex.org/I36234482"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Stergiou, A G & Damen, D 2023, Play It Back : Iterative Attention For Audio Recognition. in ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Institute of Electrical and Electronics Engineers (IEEE). https://doi.org/10.1109/ICASSP49357.2023.10096532","raw_type":"contributionToPeriodical"},"sustainable_development_goals":[{"score":0.7400000095367432,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[{"id":"https://openalex.org/G397022313","display_name":"UMPIRE: United Model for the Perception of Interactions in visuoauditory REcognition","funder_award_id":"EP/T004991/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":46,"referenced_works":["https://openalex.org/W2123051636","https://openalex.org/W2132785096","https://openalex.org/W2194775991","https://openalex.org/W2298932978","https://openalex.org/W2593116425","https://openalex.org/W2737725206","https://openalex.org/W2765407302","https://openalex.org/W2889469641","https://openalex.org/W2955425717","https://openalex.org/W2962915133","https://openalex.org/W2963610932","https://openalex.org/W3015371781","https://openalex.org/W3015530480","https://openalex.org/W3037784242","https://openalex.org/W3049446265","https://openalex.org/W3094502228","https://openalex.org/W3094550259","https://openalex.org/W3097777922","https://openalex.org/W3138516171","https://openalex.org/W3163937874","https://openalex.org/W3169064633","https://openalex.org/W3170874841","https://openalex.org/W3174906557","https://openalex.org/W3205475937","https://openalex.org/W3205743929","https://openalex.org/W3207758636","https://openalex.org/W4226442948","https://openalex.org/W4285483774","https://openalex.org/W4297841853","https://openalex.org/W4309435465","https://openalex.org/W4312639100","https://openalex.org/W4312769131","https://openalex.org/W4375869340","https://openalex.org/W6678148215","https://openalex.org/W6697221206","https://openalex.org/W6745136726","https://openalex.org/W6762718338","https://openalex.org/W6779809370","https://openalex.org/W6784333009","https://openalex.org/W6788135285","https://openalex.org/W6790830454","https://openalex.org/W6797613833","https://openalex.org/W6802510933","https://openalex.org/W6803602085","https://openalex.org/W6810841414","https://openalex.org/W6811718501"],"related_works":["https://openalex.org/W2289868279","https://openalex.org/W2970176078","https://openalex.org/W4231351862","https://openalex.org/W4212794605","https://openalex.org/W4315836293","https://openalex.org/W4243888788","https://openalex.org/W1975359510","https://openalex.org/W3004352674","https://openalex.org/W2769884427","https://openalex.org/W2088690926"],"abstract_inverted_index":{"A":[0],"key":[1],"function":[2],"of":[3,9],"auditory":[4],"cognition":[5],"is":[6],"the":[7,28,49,54,61,68,79],"association":[8],"characteristic":[10],"sounds":[11,31,52],"with":[12],"their":[13,34],"corresponding":[14],"semantics":[15],"over":[16,48],"time.":[17],"Humans":[18],"attempting":[19],"to":[20,32],"discriminate":[21],"between":[22],"fine-grained":[23],"audio":[24,55,63],"categories,":[25],"often":[26],"replay":[27],"same":[29],"discriminative":[30,51],"increase":[33],"prediction":[35],"confidence.":[36],"We":[37,97],"propose":[38],"an":[39],"end-to-end":[40],"attention-based":[41],"architecture":[42],"that":[43,99],"through":[44],"selective":[45],"repetition":[46],"attends":[47],"most":[50],"across":[53,107],"sequence.":[56],"Our":[57],"model":[58],"initially":[59],"uses":[60],"full":[62],"sequence":[64],"and":[65,113],"iteratively":[66],"refines":[67],"temporal":[69],"segments":[70,81],"replayed":[71,83],"based":[72],"on":[73],"slot":[74],"attention.":[75],"At":[76],"each":[77],"playback,":[78],"selected":[80],"are":[82],"using":[84],"a":[85],"smaller":[86],"hop":[87],"length":[88],"which":[89],"represents":[90],"higher":[91],"resolution":[92],"features":[93],"within":[94],"these":[95],"segments.":[96],"show":[98],"our":[100],"method":[101],"can":[102],"consistently":[103],"achieve":[104],"state-of-the-art":[105],"performance":[106],"three":[108],"audio-classification":[109],"benchmarks:":[110],"AudioSet,":[111],"VGG-Sound,":[112],"EPIC-KITCHENS-100.":[114],"<sup":[115],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[116],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[117]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2023,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
