{"id":"https://openalex.org/W4415883588","doi":"https://doi.org/10.1109/tpami.2025.3628653","title":"${\\text{CA}^{2}\\text{ST}}$: Cross-Attention in Audio, Space, and Time for Holistic Video Recognition","display_name":"${\\text{CA}^{2}\\text{ST}}$: Cross-Attention in Audio, Space, and Time for Holistic Video Recognition","publication_year":2025,"publication_date":"2025-11-04","ids":{"openalex":"https://openalex.org/W4415883588","doi":"https://doi.org/10.1109/tpami.2025.3628653","pmid":"https://pubmed.ncbi.nlm.nih.gov/41187029"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2025.3628653","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3628653","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079368750","display_name":"Jongseo Lee","orcid":"https://orcid.org/0000-0002-4072-4148"},"institutions":[{"id":"https://openalex.org/I35928602","display_name":"Kyung Hee University","ror":"https://ror.org/01zqcg218","country_code":"KR","type":"education","lineage":["https://openalex.org/I35928602"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Jongseo Lee","raw_affiliation_strings":["Kyung Hee University, Yongin, Republic of Korea","Kyung Hee University, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Kyung Hee University, Yongin, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]},{"raw_affiliation_string":"Kyung Hee University, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002418613","display_name":"Joon\u2010Hyuk Chang","orcid":"https://orcid.org/0000-0003-2610-2323"},"institutions":[{"id":"https://openalex.org/I35928602","display_name":"Kyung Hee University","ror":"https://ror.org/01zqcg218","country_code":"KR","type":"education","lineage":["https://openalex.org/I35928602"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Joohyun Chang","raw_affiliation_strings":["Kyung Hee University, Yongin, Republic of Korea","Kyung Hee University, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Kyung Hee University, Yongin, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]},{"raw_affiliation_string":"Kyung Hee University, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100609743","display_name":"Dongho Lee","orcid":"https://orcid.org/0000-0002-6801-9061"},"institutions":[{"id":"https://openalex.org/I35928602","display_name":"Kyung Hee University","ror":"https://ror.org/01zqcg218","country_code":"KR","type":"education","lineage":["https://openalex.org/I35928602"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Dongho Lee","raw_affiliation_strings":["Kyung Hee University, Yongin, Republic of Korea","Kyung Hee University, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Kyung Hee University, Yongin, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]},{"raw_affiliation_string":"Kyung Hee University, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062638357","display_name":"Jinwoo Choi","orcid":"https://orcid.org/0000-0001-7043-0610"},"institutions":[{"id":"https://openalex.org/I35928602","display_name":"Kyung Hee University","ror":"https://ror.org/01zqcg218","country_code":"KR","type":"education","lineage":["https://openalex.org/I35928602"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jinwoo Choi","raw_affiliation_strings":["Kyung Hee University, Yongin, Republic of Korea","Kyung Hee University, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Kyung Hee University, Yongin, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]},{"raw_affiliation_string":"Kyung Hee University, Republic of Korea","institution_ids":["https://openalex.org/I35928602"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5079368750"],"corresponding_institution_ids":["https://openalex.org/I35928602"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34965585,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"48","issue":"3","first_page":"2803","last_page":"2819"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.8360000252723694,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.8360000252723694,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.052000001072883606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.023099999874830246,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6890000104904175},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.4884999990463257},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.3986000120639801},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.3758000135421753},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.35679998993873596},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.32269999384880066},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.3160000145435333}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8320000171661377},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6890000104904175},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5600000023841858},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4478999972343445},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3986000120639801},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3864000141620636},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.3758000135421753},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3160000145435333},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.31529998779296875},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2854999899864197},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.27320000529289246},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2623000144958496},{"id":"https://openalex.org/C189693848","wikidata":"https://www.wikidata.org/wiki/Q6031064","display_name":"Information exchange","level":2,"score":0.2590999901294708},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.25029999017715454}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2025.3628653","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3628653","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:41187029","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41187029","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":72,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W1927052826","https://openalex.org/W2066008732","https://openalex.org/W2194775991","https://openalex.org/W2342662179","https://openalex.org/W2593116425","https://openalex.org/W2625366777","https://openalex.org/W2770804203","https://openalex.org/W2883429621","https://openalex.org/W2895243423","https://openalex.org/W2907214745","https://openalex.org/W2963155035","https://openalex.org/W2963315828","https://openalex.org/W2963524571","https://openalex.org/W2990152177","https://openalex.org/W2990503944","https://openalex.org/W3010010212","https://openalex.org/W3015371781","https://openalex.org/W3037046522","https://openalex.org/W3118473641","https://openalex.org/W3126721948","https://openalex.org/W3163937874","https://openalex.org/W3175300676","https://openalex.org/W3177141386","https://openalex.org/W3196974791","https://openalex.org/W3204659849","https://openalex.org/W3206930349","https://openalex.org/W3206996142","https://openalex.org/W3207758636","https://openalex.org/W3216270236","https://openalex.org/W4205991051","https://openalex.org/W4214612132","https://openalex.org/W4214614183","https://openalex.org/W4221152513","https://openalex.org/W4312238419","https://openalex.org/W4312266966","https://openalex.org/W4312302951","https://openalex.org/W4312361652","https://openalex.org/W4312372834","https://openalex.org/W4312558481","https://openalex.org/W4312560592","https://openalex.org/W4312614039","https://openalex.org/W4312658081","https://openalex.org/W4312772544","https://openalex.org/W4312906857","https://openalex.org/W4323647359","https://openalex.org/W4385245566","https://openalex.org/W4386071707","https://openalex.org/W4386075636","https://openalex.org/W4386076314","https://openalex.org/W4386113246","https://openalex.org/W4390873312","https://openalex.org/W4390874283","https://openalex.org/W4391952644","https://openalex.org/W4392251991","https://openalex.org/W4392903137","https://openalex.org/W4393147243","https://openalex.org/W4393158264","https://openalex.org/W4396877837","https://openalex.org/W4402671548","https://openalex.org/W4402713111","https://openalex.org/W4402727512","https://openalex.org/W4402727523","https://openalex.org/W4402727574","https://openalex.org/W4402727889","https://openalex.org/W4402727913","https://openalex.org/W4402754197","https://openalex.org/W4404724811","https://openalex.org/W4404784276","https://openalex.org/W4404792725","https://openalex.org/W4412494001","https://openalex.org/W4413147595"],"related_works":[],"abstract_inverted_index":{"We":[0,96,115],"propose":[1,41],"Cross-Attention":[2,47,63,90],"in":[3,18,48,91],"Audio,":[4],"Space,":[5],"and":[6,23,50,67,73,93,109,131,157,163,170],"Time":[7,51],"(C$\\text{A}^{2}$A2ST),":[8],"a":[9,31,42],"transformer-based":[10],"method":[11],"for":[12],"holistic":[13,78,171],"video":[14,79,172],"recognition.":[15],"Recognizing":[16],"actions":[17],"videos":[19],"requires":[20],"both":[21],"spatial":[22,66],"temporal":[24,68],"understanding,":[25,80],"yet":[26],"most":[27],"existing":[28],"models":[29],"lack":[30],"balanced":[32,113,169],"spatio-temporal":[33],"understanding":[34],"of":[35,60],"videos.":[36],"To":[37],"address":[38],"this,":[39],"we":[40,81],"novel":[43],"two-stream":[44],"architecture,":[45],"called":[46],"Space":[49],"(CAST),":[52],"using":[53],"only":[54],"RGB":[55],"input.":[56],"In":[57,152],"each":[58],"layer":[59],"CAST,":[61],"Bottleneck":[62],"(B-CA)":[64],"enables":[65],"experts":[69,147,165],"to":[70,111],"exchange":[71,144],"information":[72,143],"make":[74],"synergistic":[75],"predictions.":[76],"For":[77],"extend":[82],"CAST":[83,99,156],"by":[84,159],"integrating":[85],"an":[86],"audio":[87,164],"expert,":[88],"forming":[89],"Visual":[92],"Audio":[94],"(CAVA).":[95],"validate":[97,117],"the":[98,118,141,149],"on":[100,120,137],"benchmarks":[101],"with":[102],"different":[103],"characteristics,":[104],"EPIC-KITCHENS-100,":[105],"Something-Something-V2,":[106],"Kinetics-400,":[107],"ActivityNet,":[108],"HD-EPIC":[110],"show":[112],"performance.":[114],"also":[116],"CAVA":[119,133,158],"audio-visual":[121],"action":[122],"recognition":[123],"benchmarks,":[124],"including":[125],"UCF-101,":[126],"VGG-Sound,":[127],"KineticsSound,":[128],"EPIC-":[129],"SOUNDS,":[130],"HD-EPIC-SOUNDS.":[132],"shows":[134],"favorable":[135],"performance":[136],"these":[138],"datasets,":[139],"demonstrating":[140],"effective":[142],"among":[145],"multiple":[146],"within":[148],"B-CA":[150],"module.":[151],"addition,":[153],"C$\\text{A}^{2}$A2ST":[154],"combines":[155],"employing":[160],"spatial,":[161],"temporal,":[162],"through":[166],"cross-attention,":[167],"achieving":[168],"understanding.":[173]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-04T00:00:00"}
