{"id":"https://openalex.org/W4401610145","doi":"https://doi.org/10.1109/icasspw62465.2024.10626141","title":"Positive and Negative Sampling Strategies for Self-Supervised Learning on Audio-Video Data","display_name":"Positive and Negative Sampling Strategies for Self-Supervised Learning on Audio-Video Data","publication_year":2024,"publication_date":"2024-04-14","ids":{"openalex":"https://openalex.org/W4401610145","doi":"https://doi.org/10.1109/icasspw62465.2024.10626141"},"language":"en","primary_location":{"id":"doi:10.1109/icasspw62465.2024.10626141","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icasspw62465.2024.10626141","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100406235","display_name":"Shanshan Wang","orcid":"https://orcid.org/0000-0001-9530-6958"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":true,"raw_author_name":"Shanshan Wang","raw_affiliation_strings":["Tampere University,Signal Processing Research Centre,Tampere,Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University,Signal Processing Research Centre,Tampere,Finland","institution_ids":["https://openalex.org/I166825849"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113833059","display_name":"Soumya Tripathy","orcid":null},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Soumya Tripathy","raw_affiliation_strings":["Tampere University,Signal Processing Research Centre,Tampere,Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University,Signal Processing Research Centre,Tampere,Finland","institution_ids":["https://openalex.org/I166825849"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059929826","display_name":"Toni Heittola","orcid":"https://orcid.org/0000-0002-8855-0415"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Toni Heittola","raw_affiliation_strings":["Tampere University,Signal Processing Research Centre,Tampere,Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University,Signal Processing Research Centre,Tampere,Finland","institution_ids":["https://openalex.org/I166825849"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079981416","display_name":"Annamaria Mesaros","orcid":"https://orcid.org/0000-0002-6640-9752"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Annamaria Mesaros","raw_affiliation_strings":["Tampere University,Signal Processing Research Centre,Tampere,Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University,Signal Processing Research Centre,Tampere,Finland","institution_ids":["https://openalex.org/I166825849"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100406235"],"corresponding_institution_ids":["https://openalex.org/I166825849"],"apc_list":null,"apc_paid":null,"fwci":0.739,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.68691836,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":"33","issue":null,"first_page":"545","last_page":"549"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9158999919891357,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9158999919891357,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7868621945381165},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5664427280426025},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.476496160030365},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.391417533159256},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.380531907081604},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.3668663501739502},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.15929871797561646}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7868621945381165},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5664427280426025},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.476496160030365},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.391417533159256},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.380531907081604},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3668663501739502},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.15929871797561646},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icasspw62465.2024.10626141","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icasspw62465.2024.10626141","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"},{"id":"pmh:oai:trepo.tuni.fi:10024/225996","is_oa":false,"landing_page_url":"https://trepo.tuni.fi/handle/10024/225996","pdf_url":null,"source":{"id":"https://openalex.org/S7407055260","display_name":"Trepo - Institutional Repository of Tampere University","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"conference"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W2108598243","https://openalex.org/W2593116425","https://openalex.org/W2842511635","https://openalex.org/W2890052321","https://openalex.org/W2955425717","https://openalex.org/W2962960500","https://openalex.org/W2963155035","https://openalex.org/W2963859210","https://openalex.org/W3015371781","https://openalex.org/W3137857706","https://openalex.org/W3161541317","https://openalex.org/W3196436997","https://openalex.org/W4281663607","https://openalex.org/W4287608901","https://openalex.org/W4295312788","https://openalex.org/W4297808394","https://openalex.org/W4372269994","https://openalex.org/W4372347384","https://openalex.org/W6754048563","https://openalex.org/W6762718338","https://openalex.org/W6766978945","https://openalex.org/W6774314701","https://openalex.org/W6785011006"],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W3046775127","https://openalex.org/W3107602296","https://openalex.org/W4394896187","https://openalex.org/W3170094116","https://openalex.org/W4386462264","https://openalex.org/W4364306694","https://openalex.org/W4312192474","https://openalex.org/W4283697347"],"abstract_inverted_index":{"In":[0,90],"Self-Supervised":[1],"Learning":[2],"(SSL),":[3],"Audio-Visual":[4],"Correspondence":[5],"(AVC)":[6],"is":[7,27,87,151,174],"a":[8,134,160,222],"popular":[9],"task":[10],"to":[11,28,40,68,124,232],"learn":[12,39],"deep":[13],"audio":[14,31,149],"and":[15,32,38,112,201,205,227],"video":[16,33,161],"features":[17,65,179],"from":[18,35,153,159,189],"large":[19],"unlabeled":[20],"datasets.":[21],"The":[22,63],"key":[23],"step":[24],"in":[25,102,176,198],"AVC":[26,104],"randomly":[29],"sample":[30,150],"clips":[34],"the":[36,42,46,54,57,82,85,95,103,126,129,144,154,163,171,182,190,234],"dataset":[37,86,172],"minimize":[41],"feature":[43,105,130],"distance":[44,55],"between":[45,56],"positive":[47,111,145],"pairs":[48,59],"(corresponding":[49],"audio-video":[50,61],"pair)":[51],"while":[52],"maximizing":[53],"negative":[58,113],"(non-corresponding":[60],"pairs).":[62],"learnt":[64,180],"are":[66],"shown":[67],"be":[69,219,230],"effective":[70],"on":[71,120,128],"various":[72,110],"downstream":[73,203],"tasks.":[74],"However,":[75],"these":[76],"methods":[77],"achieve":[78],"subpar":[79],"performance":[80,236],"when":[81,170],"size":[83,173],"of":[84,97,117,162],"rather":[88],"small.":[89],"this":[91],"paper,":[92],"we":[93,139],"investigate":[94,125],"effect":[96,127],"utilizing":[98],"class":[99,121,214],"label":[100,122,215],"information":[101,123,216],"learning":[106],"task.":[107],"We":[108,132],"modified":[109],"data":[114],"sampling":[115,136,184,193],"techniques":[116],"SSL":[118,177,192,235],"based":[119],"quality.":[131],"propose":[133],"new":[135],"approach":[137],"which":[138],"call":[140],"soft-positive":[141,183],"sampling,":[142],"where":[143],"pair":[146],"for":[147],"one":[148],"not":[152],"exact":[155],"corresponding":[156],"video,":[157],"but":[158],"same":[164],"class.":[165],"Experimental":[166],"results":[167],"suggest":[168],"that":[169,213],"small":[175],"setup,":[178],"through":[181],"method":[185],"significantly":[186],"outperform":[187],"those":[188],"traditional":[191],"approaches.":[194],"This":[195],"trend":[196],"holds":[197],"both":[199],"in-domain":[200],"out-of-domain":[202],"tasks,":[204],"even":[206],"outperforms":[207],"supervised":[208],"classification.":[209],"Finally,":[210],"experiments":[211],"show":[212],"can":[217,229],"easily":[218],"obtained":[220],"using":[221],"publicly":[223],"available":[224],"classifier":[225],"network":[226],"then":[228],"used":[231],"boost":[233],"without":[237],"adding":[238],"extra":[239],"data-":[240],"annotation":[241],"burden.":[242]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-06T13:50:29.536080","created_date":"2025-10-10T00:00:00"}
