{"id":"https://openalex.org/W4388893704","doi":"https://doi.org/10.1109/euvip58404.2023.10323051","title":"MAiVAR-T: Multimodal Audio-image and Video Action Recognizer using Transformers","display_name":"MAiVAR-T: Multimodal Audio-image and Video Action Recognizer using Transformers","publication_year":2023,"publication_date":"2023-09-11","ids":{"openalex":"https://openalex.org/W4388893704","doi":"https://doi.org/10.1109/euvip58404.2023.10323051"},"language":"en","primary_location":{"id":"doi:10.1109/euvip58404.2023.10323051","is_oa":false,"landing_page_url":"https://doi.org/10.1109/euvip58404.2023.10323051","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 11th European Workshop on Visual Information Processing (EUVIP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036110830","display_name":"Muhammad Bilal Shaikh","orcid":"https://orcid.org/0000-0001-9042-5018"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Muhammad Bilal Shaikh","raw_affiliation_strings":["Edith Cowan University,School of Engineering","School of Engineering, Edith Cowan University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Edith Cowan University,School of Engineering","institution_ids":["https://openalex.org/I12079687"]},{"raw_affiliation_string":"School of Engineering, Edith Cowan University","institution_ids":["https://openalex.org/I12079687"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056138775","display_name":"Douglas Chai","orcid":"https://orcid.org/0000-0002-9004-7608"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Douglas Chai","raw_affiliation_strings":["Edith Cowan University,School of Engineering","School of Engineering, Edith Cowan University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Edith Cowan University,School of Engineering","institution_ids":["https://openalex.org/I12079687"]},{"raw_affiliation_string":"School of Engineering, Edith Cowan University","institution_ids":["https://openalex.org/I12079687"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034900636","display_name":"Syed Mohammed Shamsul Islam","orcid":"https://orcid.org/0000-0002-3200-2903"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Syed Mohammed Shamsul Islam","raw_affiliation_strings":["Edith Cowan University,School of Science","School of Science, Edith Cowan University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Edith Cowan University,School of Science","institution_ids":["https://openalex.org/I12079687"]},{"raw_affiliation_string":"School of Science, Edith Cowan University","institution_ids":["https://openalex.org/I12079687"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069697936","display_name":"Naveed Akhtar","orcid":"https://orcid.org/0000-0003-3406-673X"},"institutions":[{"id":"https://openalex.org/I177877127","display_name":"The University of Western Australia","ror":"https://ror.org/047272k79","country_code":"AU","type":"education","lineage":["https://openalex.org/I177877127"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Naveed Akhtar","raw_affiliation_strings":["The University of Western Australia,Department of Computer Science &amp; Software Engineering"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Western Australia,Department of Computer Science &amp; Software Engineering","institution_ids":["https://openalex.org/I177877127"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4286,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.63515951,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8249802589416504},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.6983180046081543},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5241489410400391},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5203754305839539},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.4987640380859375},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.47155046463012695},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4664181172847748},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4541262686252594},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.39010754227638245},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.36978304386138916}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8249802589416504},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.6983180046081543},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5241489410400391},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5203754305839539},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.4987640380859375},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.47155046463012695},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4664181172847748},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4541262686252594},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.39010754227638245},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36978304386138916},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/euvip58404.2023.10323051","is_oa":false,"landing_page_url":"https://doi.org/10.1109/euvip58404.2023.10323051","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 11th European Workshop on Visual Information Processing (EUVIP)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.atira.dk:publications/37866de0-ac2f-4b0d-addb-ab455a3edf9d","is_oa":false,"landing_page_url":"https://research-repository.uwa.edu.au/en/publications/37866de0-ac2f-4b0d-addb-ab455a3edf9d","pdf_url":null,"source":{"id":"https://openalex.org/S4306402523","display_name":"UWA Profiles and Research Repository (University of Western Australia)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I177877127","host_organization_name":"The University of Western Australia","host_organization_lineage":["https://openalex.org/I177877127"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Shaikh, M B, Chai, D, Shamsul Islam, S M & Akhtar, N 2023, MAiVAR-T : Multimodal Audio-image and Video Action Recognizer using Transformers. in 2023 11th European Workshop on Visual Information Processing, EUVIP 2023 - Proceedings. Proceedings - European Workshop on Visual Information Processing, EUVIP, IEEE, Institute of Electrical and Electronics Engineers, 11th European Workshop on Visual Information Processing, Gjovik, Norway, 11/09/23. https://doi.org/10.1109/EUVIP58404.2023.10323051","raw_type":"info:eu-repo/semantics/conferenceObject"},{"id":"pmh:oai:pure.atira.dk:publications/37866de0-ac2f-4b0d-addb-ab455a3edf9d","is_oa":false,"landing_page_url":"http://www.scopus.com/inward/record.url?scp=85179508077&partnerID=8YFLogxK","pdf_url":null,"source":{"id":"https://openalex.org/S4306402523","display_name":"UWA Profiles and Research Repository (University of Western Australia)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I177877127","host_organization_name":"The University of Western Australia","host_organization_lineage":["https://openalex.org/I177877127"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Shaikh , M B , Chai , D , Shamsul Islam , S M &amp; Akhtar , N 2023 , MAiVAR-T : Multimodal Audio-image and Video Action Recognizer using Transformers . in 2023 11th European Workshop on Visual Information Processing, EUVIP 2023 - Proceedings . Proceedings - European Workshop on Visual Information Processing, EUVIP , IEEE, Institute of Electrical and Electronics Engineers , 11th European Workshop on Visual Information Processing , Gjovik , Norway , 11/09/23 . https://doi.org/10.1109/EUVIP58404.2023.10323051","raw_type":"contributionToPeriodical"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320988","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W24089286","https://openalex.org/W1522301498","https://openalex.org/W1522734439","https://openalex.org/W2002591263","https://openalex.org/W2009059481","https://openalex.org/W2064675550","https://openalex.org/W2095705004","https://openalex.org/W2147800946","https://openalex.org/W2194775991","https://openalex.org/W2507009361","https://openalex.org/W2570915410","https://openalex.org/W2618530766","https://openalex.org/W2767290858","https://openalex.org/W2787690100","https://openalex.org/W2896457183","https://openalex.org/W2964109005","https://openalex.org/W2968553732","https://openalex.org/W2971680695","https://openalex.org/W2990152177","https://openalex.org/W3021197603","https://openalex.org/W3034658206","https://openalex.org/W3091959638","https://openalex.org/W3094502228","https://openalex.org/W3152403802","https://openalex.org/W3175080943","https://openalex.org/W3175419009","https://openalex.org/W4214612132","https://openalex.org/W4220894980","https://openalex.org/W4226025707","https://openalex.org/W4316660066","https://openalex.org/W4383532469","https://openalex.org/W4385245566","https://openalex.org/W6600983433","https://openalex.org/W6631190155","https://openalex.org/W6674330103","https://openalex.org/W6755207826","https://openalex.org/W6784333009"],"related_works":["https://openalex.org/W73545470","https://openalex.org/W4224266612","https://openalex.org/W2383394264","https://openalex.org/W4320153225","https://openalex.org/W4293261942","https://openalex.org/W3125968744","https://openalex.org/W2167701463","https://openalex.org/W2110287964","https://openalex.org/W4307407935","https://openalex.org/W649759291"],"abstract_inverted_index":{"In":[0,123],"line":[1],"with":[2,51,94],"the":[3,8,44,57,66,71,78,85,95,109,153,159,181],"human":[4,61],"capacity":[5],"to":[6,32,55,98,107,125],"perceive":[7],"world":[9],"by":[10],"simultaneously":[11],"processing":[12],"and":[13,22,48,81,116,166,176],"integrating":[14,164],"high-dimensional":[15],"inputs":[16],"from":[17,77,163],"multiple":[18],"modalities":[19,168],"like":[20],"vision":[21],"audio,":[23],"we":[24],"propose":[25],"a":[26,52,100,147],"novel":[27],"model,":[28],"MAiVAR-T":[29,69,137],"(Multimodal":[30],"Audio-Image":[31],"Video":[33],"Action":[34],"Recognition":[35],"Transformer).":[36],"This":[37,103,157],"model":[38],"employs":[39],"an":[40],"intuitive":[41],"approach":[42,105],"for":[43,169],"combination":[45],"of":[46,59,68,73,178],"audio-image":[47,90],"video":[49,96,117,135,167],"modalities,":[50,118,136],"primary":[53],"aim":[54],"escalate":[56],"effectiveness":[58],"multimodal":[60],"action":[62,121,149,170],"recognition":[63,150,171],"(MHAR).":[64],"At":[65],"core":[67],"lies":[70],"significance":[72],"distilling":[74],"substantial":[75],"representations":[76],"audio":[79,115,133,165],"modality":[80,97],"transmuting":[82],"these":[83],"into":[84],"image":[86],"domain.":[87],"Subsequently,":[88],"this":[89],"depiction":[91],"is":[92,184],"fused":[93],"formulate":[99],"unified":[101],"representation.":[102],"concerted":[104],"strives":[106],"exploit":[108],"contextual":[110],"richness":[111],"inherent":[112],"in":[113],"both":[114],"thereby":[119],"promoting":[120],"recognition.":[122],"contrast":[124],"existing":[126],"state-of-the-art":[127],"strategies":[128],"that":[129],"focus":[130],"solely":[131],"on":[132,146],"or":[134],"demonstrates":[138],"superior":[139],"performance.":[140,156],"Our":[141],"extensive":[142],"empirical":[143],"evaluations":[144],"conducted":[145],"benchmark":[148],"dataset":[151],"corroborate":[152],"model\u2019s":[154],"remarkable":[155],"underscores":[158],"potential":[160],"enhancements":[161],"derived":[162],"purposes.":[172],"To":[173],"ensure":[174],"transparency":[175],"reproducibility":[177],"our":[179],"work,":[180],"source":[182],"code":[183],"made":[185],"publicly":[186],"available":[187],"at":[188],"https://bit.ly/43do8DH.":[189]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2}],"updated_date":"2026-07-02T09:51:11.867554","created_date":"2025-10-10T00:00:00"}
