{"id":"https://openalex.org/W4386590516","doi":"https://doi.org/10.1109/icip49359.2023.10222383","title":"Self-Supervised Contrastive Learning for Audio-Visual Action Recognition","display_name":"Self-Supervised Contrastive Learning for Audio-Visual Action Recognition","publication_year":2023,"publication_date":"2023-09-11","ids":{"openalex":"https://openalex.org/W4386590516","doi":"https://doi.org/10.1109/icip49359.2023.10222383"},"language":"en","primary_location":{"id":"doi:10.1109/icip49359.2023.10222383","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icip49359.2023.10222383","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Image Processing (ICIP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100356036","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0002-9423-9252"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yang Liu","raw_affiliation_strings":["Sun Yat-Sen University,School of Computer Science and Engineering","School of Computer Science and Engineering, Sun Yat-Sen University"],"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University,School of Computer Science and Engineering","institution_ids":["https://openalex.org/I157773358"]},{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-Sen University","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023089209","display_name":"Ying Tan","orcid":"https://orcid.org/0000-0001-8243-4731"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Tan","raw_affiliation_strings":["Sun Yat-Sen University,School of Computer Science and Engineering","School of Computer Science and Engineering, Sun Yat-Sen University"],"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University,School of Computer Science and Engineering","institution_ids":["https://openalex.org/I157773358"]},{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-Sen University","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036024481","display_name":"Haoyuan Lan","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoyuan Lan","raw_affiliation_strings":["Sun Yat-Sen University,School of Computer Science and Engineering","School of Computer Science and Engineering, Sun Yat-Sen University"],"affiliations":[{"raw_affiliation_string":"Sun Yat-Sen University,School of Computer Science and Engineering","institution_ids":["https://openalex.org/I157773358"]},{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-Sen University","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100356036"],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":0.6174,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.69762325,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1000","last_page":"1004"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7990787625312805},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.624840259552002},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5880597829818726},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.5664977431297302},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4882838726043701},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4767911434173584},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3565606474876404},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3372431993484497},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.27817562222480774},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.05513441562652588}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7990787625312805},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.624840259552002},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5880597829818726},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.5664977431297302},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4882838726043701},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4767911434173584},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3565606474876404},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3372431993484497},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.27817562222480774},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.05513441562652588},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icip49359.2023.10222383","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icip49359.2023.10222383","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Image Processing (ICIP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W2619697695","https://openalex.org/W2619947201","https://openalex.org/W2948242301","https://openalex.org/W2963115079","https://openalex.org/W2963155035","https://openalex.org/W2963247196","https://openalex.org/W2964037671","https://openalex.org/W2971680695","https://openalex.org/W3015949486","https://openalex.org/W3034381931","https://openalex.org/W3034658206","https://openalex.org/W3035524453","https://openalex.org/W3047425522","https://openalex.org/W3048939150","https://openalex.org/W3099638501","https://openalex.org/W3101999878","https://openalex.org/W3171007011","https://openalex.org/W4213304546","https://openalex.org/W4225674167","https://openalex.org/W4287608901","https://openalex.org/W4308455108","https://openalex.org/W4379929708","https://openalex.org/W6682948231","https://openalex.org/W6774314701","https://openalex.org/W6779326418","https://openalex.org/W6785011006","https://openalex.org/W6791742336","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W1576128429","https://openalex.org/W2269464716"],"abstract_inverted_index":{"The":[0],"underlying":[1],"correlation":[2],"between":[3],"audio":[4,52],"and":[5,53,103],"visual":[6,54],"modalities":[7],"can":[8],"be":[9],"utilized":[10],"to":[11,32,50],"learn":[12,33,72],"supervised":[13,73],"information":[14,74],"for":[15,37],"unlabeled":[16,76],"videos.":[17],"In":[18],"this":[19],"paper,":[20],"we":[21,41,61,78,88],"propose":[22,79],"an":[23,43],"end-to-end":[24],"self-supervised":[25,82],"framework":[26],"named":[27,96],"Audio-Visual":[28],"Contrastive":[29],"Learning":[30],"(AVCL),":[31],"discriminative":[34],"audio-visual":[35,59,92],"representations":[36],"action":[38,93,118],"recognition.":[39],"Specifically,":[40],"design":[42],"attention":[44],"based":[45],"multi-modal":[46],"fusion":[47],"module":[48,69,85],"(AMFM)":[49],"fuse":[51],"modalities.":[55],"To":[56,71],"align":[57],"heterogeneous":[58],"modalities,":[60],"construct":[62],"a":[63,80,90],"novel":[64,81],"co-correlation":[65],"guided":[66],"representation":[67],"alignment":[68],"(CGRA).":[70],"from":[75],"videos,":[77],"contrastive":[83],"learning":[84],"(SelfCL).":[86],"Furthermore,":[87],"build":[89],"new":[91],"recognition":[94,119],"dataset":[95],"Kinetics-Sounds100.":[97],"Experimental":[98],"results":[99],"on":[100,116],"the":[101,107,113],"Kinetics-Sounds32":[102],"Kinetics-Sounds100":[104],"datasets":[105],"demonstrate":[106],"superiority":[108],"of":[109],"our":[110],"AVCL":[111],"over":[112],"state-of-the-art":[114],"methods":[115],"large-scale":[117],"benchmark.":[120]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
