{"id":"https://openalex.org/W4224920427","doi":"https://doi.org/10.1109/icassp43922.2022.9747361","title":"Enhancing Contrastive Learning with Temporal Cognizance for Audio-Visual Representation Generation","display_name":"Enhancing Contrastive Learning with Temporal Cognizance for Audio-Visual Representation Generation","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4224920427","doi":"https://doi.org/10.1109/icassp43922.2022.9747361"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747361","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747361","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058760668","display_name":"Chandrashekhar Lavania","orcid":null},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Chandrashekhar Lavania","raw_affiliation_strings":["Amazon"],"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113697636","display_name":"Shiva Sundaram","orcid":null},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Shiva Sundaram","raw_affiliation_strings":["Amazon"],"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047128701","display_name":"Sundararajan Srinivasan","orcid":"https://orcid.org/0000-0002-3387-9889"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Sundararajan Srinivasan","raw_affiliation_strings":["Amazon"],"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050325468","display_name":"Katrin Kirchhoff","orcid":"https://orcid.org/0000-0002-6645-6030"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Katrin Kirchhoff","raw_affiliation_strings":["Amazon"],"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5058760668"],"corresponding_institution_ids":["https://openalex.org/I4210089985"],"apc_list":null,"apc_paid":null,"fwci":0.1227,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.20537125,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"33","issue":null,"first_page":"4728","last_page":"4732"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.8565361499786377},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8449459075927734},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6318305730819702},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5974904298782349},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5894177556037903},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5381871461868286},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5353363752365112},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4639572501182556},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.45033836364746094},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.45012402534484863},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3830583393573761},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3514821529388428},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.34729519486427307},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.10402107238769531}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.8565361499786377},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8449459075927734},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6318305730819702},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5974904298782349},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5894177556037903},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5381871461868286},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5353363752365112},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4639572501182556},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.45033836364746094},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.45012402534484863},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3830583393573761},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3514821529388428},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34729519486427307},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.10402107238769531},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747361","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747361","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.5899999737739563,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W2126579184","https://openalex.org/W2149628368","https://openalex.org/W2152790380","https://openalex.org/W2529272619","https://openalex.org/W2593116425","https://openalex.org/W2604379605","https://openalex.org/W2781922022","https://openalex.org/W2890052321","https://openalex.org/W2896457183","https://openalex.org/W2903758693","https://openalex.org/W2962960500","https://openalex.org/W2963155035","https://openalex.org/W2990408345","https://openalex.org/W3015974384","https://openalex.org/W3035118106","https://openalex.org/W3037309139","https://openalex.org/W3048939150","https://openalex.org/W3099638501","https://openalex.org/W3114214226","https://openalex.org/W3144366453","https://openalex.org/W3154596443","https://openalex.org/W3175300676","https://openalex.org/W4294576248","https://openalex.org/W6682948231","https://openalex.org/W6728272395","https://openalex.org/W6735927292","https://openalex.org/W6738607494","https://openalex.org/W6738806211","https://openalex.org/W6747045456","https://openalex.org/W6750591037","https://openalex.org/W6754048563","https://openalex.org/W6755207826","https://openalex.org/W6770805772","https://openalex.org/W6780294235","https://openalex.org/W6793736971"],"related_works":["https://openalex.org/W2366403280","https://openalex.org/W1495108544","https://openalex.org/W2091301346","https://openalex.org/W3148229873","https://openalex.org/W4389760904","https://openalex.org/W2150160875","https://openalex.org/W4242223894","https://openalex.org/W4306886878","https://openalex.org/W1517524280","https://openalex.org/W4323520239"],"abstract_inverted_index":{"Audio-visual":[0],"data":[1],"allows":[2],"us":[3],"to":[4,88],"leverage":[5],"different":[6],"modalities":[7],"for":[8],"downstream":[9],"tasks.":[10,46],"The":[11,47],"idea":[12],"being":[13],"individual":[14],"streams":[15],"can":[16],"complement":[17],"each":[18],"other":[19],"in":[20,26,56,69,86],"the":[21,53,98,105,108,122,129,145],"given":[22],"task,":[23],"thereby":[24],"resulting":[25],"a":[27,70,76,82],"model":[28,72],"with":[29,75],"improved":[30,104],"performance.":[31],"In":[32],"this":[33],"work,":[34],"we":[35,135],"present":[36],"our":[37,94],"experimental":[38],"results":[39,95],"on":[40,128,144],"action":[41,116],"recognition":[42],"and":[43],"video":[44,133],"summarization":[45],"proposed":[48],"modeling":[49],"approach":[50],"builds":[51],"upon":[52],"recent":[54],"advances":[55],"contrastive":[57,91,109],"loss":[58,80,110],"based":[59,111],"audio-visual":[60,65],"representation":[61],"learning.":[62],"Temporally":[63],"cognizant":[64],"discrimination":[66],"is":[67],"achieved":[68],"Transformer":[71],"by":[73],"learning":[74,89],"masked":[77],"feature":[78],"reconstruction":[79],"over":[81],"fixed":[83],"time":[84],"window":[85],"addition":[87,99],"via":[90],"loss.":[92],"Overall,":[93],"indicate":[96],"that":[97],"of":[100,107,119,140],"temporal":[101],"information":[102],"significantly":[103],"performance":[106],"framework.":[112],"We":[113],"achieve":[114],"an":[115,137],"classification":[117],"accuracy":[118],"66.2%":[120],"versus":[121],"next":[123],"best":[124],"baseline":[125],"at":[126],"64.7%":[127],"HMDB":[130],"dataset.":[131,147],"For":[132],"summarization,":[134],"attain":[136],"F1":[138],"score":[139],"43.5":[141],"verses":[142],"42.2":[143],"SumMe":[146]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
