{"id":"https://openalex.org/W4281663607","doi":"https://doi.org/10.1109/jstsp.2022.3180592","title":"Self-Supervised Learning of Audio Representations From Audio-Visual Data Using Spatial Alignment","display_name":"Self-Supervised Learning of Audio Representations From Audio-Visual Data Using Spatial Alignment","publication_year":2022,"publication_date":"2022-06-08","ids":{"openalex":"https://openalex.org/W4281663607","doi":"https://doi.org/10.1109/jstsp.2022.3180592"},"language":"en","primary_location":{"id":"doi:10.1109/jstsp.2022.3180592","is_oa":true,"landing_page_url":"https://doi.org/10.1109/jstsp.2022.3180592","pdf_url":"https://ieeexplore.ieee.org/ielx7/4200690/9923627/09790080.pdf","source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ieeexplore.ieee.org/ielx7/4200690/9923627/09790080.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100406235","display_name":"Shanshan Wang","orcid":"https://orcid.org/0000-0001-9530-6958"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":true,"raw_author_name":"Shanshan Wang","raw_affiliation_strings":["Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland","Faculty of Information Technology and Com-munication Sciences, Tampere University, 33100 Tampere, Finland"],"raw_orcid":"https://orcid.org/0000-0001-9530-6958","affiliations":[{"raw_affiliation_string":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland","institution_ids":["https://openalex.org/I166825849"]},{"raw_affiliation_string":"Faculty of Information Technology and Com-munication Sciences, Tampere University, 33100 Tampere, Finland","institution_ids":["https://openalex.org/I166825849"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010846139","display_name":"Archontis Politis","orcid":"https://orcid.org/0000-0002-0595-2356"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Archontis Politis","raw_affiliation_strings":["Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland","institution_ids":["https://openalex.org/I166825849"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079981416","display_name":"Annamaria Mesaros","orcid":"https://orcid.org/0000-0002-6640-9752"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Annamaria Mesaros","raw_affiliation_strings":["Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"],"raw_orcid":"https://orcid.org/0000-0002-6640-9752","affiliations":[{"raw_affiliation_string":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland","institution_ids":["https://openalex.org/I166825849"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049691461","display_name":"Tuomas Virtanen","orcid":"https://orcid.org/0000-0002-4604-9729"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Tuomas Virtanen","raw_affiliation_strings":["Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"],"raw_orcid":"https://orcid.org/0000-0002-4604-9729","affiliations":[{"raw_affiliation_string":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland","institution_ids":["https://openalex.org/I166825849"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100406235"],"corresponding_institution_ids":["https://openalex.org/I166825849"],"apc_list":null,"apc_paid":null,"fwci":2.3807,"has_fulltext":true,"cited_by_count":16,"citation_normalized_percentile":{"value":0.88592395,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"16","issue":"6","first_page":"1467","last_page":"1479"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9724000096321106,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ambisonics","display_name":"Ambisonics","score":0.917667031288147},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8310254812240601},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.596993088722229},{"id":"https://openalex.org/keywords/binaural-recording","display_name":"Binaural recording","score":0.5611438751220703},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5362875461578369},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.43770214915275574},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.42468133568763733},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4219532608985901},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3722943663597107},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.18942168354988098},{"id":"https://openalex.org/keywords/loudspeaker","display_name":"Loudspeaker","score":0.16661697626113892},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.14548298716545105}],"concepts":[{"id":"https://openalex.org/C47726159","wikidata":"https://www.wikidata.org/wiki/Q457547","display_name":"Ambisonics","level":3,"score":0.917667031288147},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8310254812240601},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.596993088722229},{"id":"https://openalex.org/C201247586","wikidata":"https://www.wikidata.org/wiki/Q5612967","display_name":"Binaural recording","level":2,"score":0.5611438751220703},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5362875461578369},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.43770214915275574},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.42468133568763733},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4219532608985901},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3722943663597107},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.18942168354988098},{"id":"https://openalex.org/C157138929","wikidata":"https://www.wikidata.org/wiki/Q570","display_name":"Loudspeaker","level":2,"score":0.16661697626113892},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.14548298716545105},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/jstsp.2022.3180592","is_oa":true,"landing_page_url":"https://doi.org/10.1109/jstsp.2022.3180592","pdf_url":"https://ieeexplore.ieee.org/ielx7/4200690/9923627/09790080.pdf","source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2206.00970","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2206.00970","pdf_url":"https://arxiv.org/pdf/2206.00970","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:trepo.tuni.fi:10024/222892","is_oa":true,"landing_page_url":"https://trepo.tuni.fi/handle/10024/222892","pdf_url":null,"source":{"id":"https://openalex.org/S7407055260","display_name":"Trepo - Institutional Repository of Tampere University","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.1109/jstsp.2022.3180592","is_oa":true,"landing_page_url":"https://doi.org/10.1109/jstsp.2022.3180592","pdf_url":"https://ieeexplore.ieee.org/ielx7/4200690/9923627/09790080.pdf","source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2360061192","display_name":"Teaching machines to listen","funder_award_id":"332063","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"},{"id":"https://openalex.org/G4923903852","display_name":null,"funder_award_id":"332063","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"}],"funders":[{"id":"https://openalex.org/F4320321108","display_name":"Academy of Finland","ror":"https://ror.org/05k73zm37"},{"id":"https://openalex.org/F4320322725","display_name":"China Scholarship Council","ror":"https://ror.org/04atp4p48"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4281663607.pdf","grobid_xml":"https://content.openalex.org/works/W4281663607.grobid-xml"},"referenced_works_count":47,"referenced_works":["https://openalex.org/W24089286","https://openalex.org/W1966081805","https://openalex.org/W2052666245","https://openalex.org/W2086384421","https://openalex.org/W2117539524","https://openalex.org/W2126579184","https://openalex.org/W2204091679","https://openalex.org/W2283105982","https://openalex.org/W2296073425","https://openalex.org/W2619697695","https://openalex.org/W2761379173","https://openalex.org/W2770337804","https://openalex.org/W2900893004","https://openalex.org/W2917254586","https://openalex.org/W2939574508","https://openalex.org/W2939726645","https://openalex.org/W2942551338","https://openalex.org/W2962960500","https://openalex.org/W2963037989","https://openalex.org/W2963155035","https://openalex.org/W2963680395","https://openalex.org/W2963859210","https://openalex.org/W2963902314","https://openalex.org/W2964306713","https://openalex.org/W2989980422","https://openalex.org/W2997340076","https://openalex.org/W3022281166","https://openalex.org/W3034742263","https://openalex.org/W3049847664","https://openalex.org/W3083274258","https://openalex.org/W3098454764","https://openalex.org/W3117314925","https://openalex.org/W3137857706","https://openalex.org/W3158504903","https://openalex.org/W3161541317","https://openalex.org/W3188558905","https://openalex.org/W4236344233","https://openalex.org/W4297808394","https://openalex.org/W6600983433","https://openalex.org/W6739901393","https://openalex.org/W6774314701","https://openalex.org/W6776700526","https://openalex.org/W6785011006","https://openalex.org/W6785591002","https://openalex.org/W6795710243","https://openalex.org/W6807232479","https://openalex.org/W6844194202"],"related_works":["https://openalex.org/W3083538027","https://openalex.org/W4389102310","https://openalex.org/W3210854820","https://openalex.org/W2187779179","https://openalex.org/W2889406797","https://openalex.org/W2964758937","https://openalex.org/W3129121609","https://openalex.org/W3126653626","https://openalex.org/W2769108308","https://openalex.org/W4210460202"],"abstract_inverted_index":{"Learning":[0],"from":[1,62,202],"audio-visual":[2,40,51],"data":[3],"offers":[4],"many":[5],"possibilities":[6],"to":[7,17,56,102,122,195],"express":[8],"correspondence":[9,52],"between":[10,107],"the":[11,18,50,57,63,94,98,104,110,116,124,144,157,168,184,187],"audio":[12,95,120,125,129,189],"and":[13,24,68,79,91,109,127,133,204],"visual":[14,25,69,86],"content,":[15],"similar":[16],"human":[19,169],"perception":[20],"that":[21],"relates":[22],"aural":[23],"information.":[26],"In":[27,54],"this":[28],"work,":[29],"we":[30,82],"present":[31],"a":[32,44,138],"method":[33],"for":[34,143,167,182],"self-supervised":[35],"representation":[36],"learning":[37],"based":[38],"on":[39,72,141,198],"spatial":[41,64,105,119],"alignment":[42,47,106],"(AVSA),":[43],"more":[45],"sophisticated":[46],"task":[48],"than":[49],"(AVC).":[53],"addition":[55,158],"correspondence,":[58],"AVSA":[59,142],"also":[60,162],"learns":[61],"location":[65],"of":[66,85,93,118,159,176,186],"acoustic":[67,199],"content.":[70],"Based":[71],"360":[73],"<inline-formula":[74],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[75],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[76],"notation=\"LaTeX\">$^\\circ$</tex-math></inline-formula>":[77],"video":[78],"Ambisonics":[80],"audio,":[81],"propose":[83],"selection":[84],"objects":[87,108],"using":[88],"object":[89],"detection,":[90],"beamforming":[92],"signal":[96],"towards":[97],"detected":[99],"objects,":[100],"attempting":[101],"learn":[103],"sound":[111],"they":[112],"produce.":[113],"We":[114],"investigate":[115],"use":[117],"features":[121],"represent":[123],"input,":[126],"different":[128],"formats:":[130],"Ambisonics,":[131],"mono,":[132],"stereo.":[134],"Experimental":[135],"results":[136],"show":[137],"10%":[139],"improvement":[140],"first":[145],"order":[146],"ambisonics":[147],"intensity":[148],"vector":[149],"(FOA-IV)":[150],"in":[151],"comparison":[152],"with":[153],"log-mel":[154],"spectrogram":[155],"features;":[156],"object-oriented":[160],"crops":[161],"brings":[163],"significant":[164],"performance":[165,193],"increases":[166],"action":[170],"recognition":[171],"downstream":[172,178],"task.":[173],"A":[174],"number":[175],"audio-only":[177],"tasks":[179],"are":[180],"devised":[181],"testing":[183],"effectiveness":[185],"learnt":[188],"feature":[190],"representation,":[191],"obtaining":[192],"comparable":[194],"state-of-the-art":[196],"methods":[197],"scene":[200],"classification":[201],"ambisonic":[203],"binaural":[205],"audio.":[206]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
