{"id":"https://openalex.org/W4396861023","doi":"https://doi.org/10.1145/3664815","title":"From CNNs to Transformers in Multimodal Human Action Recognition: A Survey","display_name":"From CNNs to Transformers in Multimodal Human Action Recognition: A Survey","publication_year":2024,"publication_date":"2024-05-13","ids":{"openalex":"https://openalex.org/W4396861023","doi":"https://doi.org/10.1145/3664815"},"language":"en","primary_location":{"id":"doi:10.1145/3664815","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664815","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3664815","source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3664815","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036110830","display_name":"Muhammad Bilal Shaikh","orcid":"https://orcid.org/0000-0001-9042-5018"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Muhammad Bilal Shaikh","raw_affiliation_strings":["School of Engineering, Edith Cowan University, Joondalup, Australia and Molycop, Balcatta, Australia"],"raw_orcid":"https://orcid.org/0000-0001-9042-5018","affiliations":[{"raw_affiliation_string":"School of Engineering, Edith Cowan University, Joondalup, Australia and Molycop, Balcatta, Australia","institution_ids":["https://openalex.org/I12079687"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056138775","display_name":"Douglas Chai","orcid":"https://orcid.org/0000-0002-9004-7608"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Douglas Chai","raw_affiliation_strings":["School of Engineering, Edith Cowan University, Joondalup, Australia"],"raw_orcid":"https://orcid.org/0000-0002-9004-7608","affiliations":[{"raw_affiliation_string":"School of Engineering, Edith Cowan University, Joondalup, Australia","institution_ids":["https://openalex.org/I12079687"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034900636","display_name":"Syed Mohammed Shamsul Islam","orcid":"https://orcid.org/0000-0002-3200-2903"},"institutions":[{"id":"https://openalex.org/I12079687","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22","country_code":"AU","type":"education","lineage":["https://openalex.org/I12079687"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Syed Muhammad Shamsul Islam","raw_affiliation_strings":["School of Science, Edith Cowan University, Joondalup, Australia"],"raw_orcid":"https://orcid.org/0000-0002-3200-2903","affiliations":[{"raw_affiliation_string":"School of Science, Edith Cowan University, Joondalup, Australia","institution_ids":["https://openalex.org/I12079687"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069697936","display_name":"Naveed Akhtar","orcid":"https://orcid.org/0000-0003-3406-673X"},"institutions":[{"id":"https://openalex.org/I165779595","display_name":"The University of Melbourne","ror":"https://ror.org/01ej9dk98","country_code":"AU","type":"education","lineage":["https://openalex.org/I165779595"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Naveed Akhtar","raw_affiliation_strings":["The University of Melbourne, Melbourne, Australia"],"raw_orcid":"https://orcid.org/0000-0003-3406-673X","affiliations":[{"raw_affiliation_string":"The University of Melbourne, Melbourne, Australia","institution_ids":["https://openalex.org/I165779595"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":9.1241,"has_fulltext":true,"cited_by_count":44,"citation_normalized_percentile":{"value":0.98672232,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"20","issue":"8","first_page":"1","last_page":"24"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.9914000034332275,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7382522821426392},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.6078736782073975},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.6017809510231018},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5754323601722717},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5520117282867432},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5085586309432983},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.501563549041748},{"id":"https://openalex.org/keywords/paradigm-shift","display_name":"Paradigm shift","score":0.4874146282672882},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4708554148674011},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.44564002752304077},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3877883553504944},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3724755644798279},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.12224262952804565}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7382522821426392},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.6078736782073975},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.6017809510231018},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5754323601722717},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5520117282867432},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5085586309432983},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.501563549041748},{"id":"https://openalex.org/C43540301","wikidata":"https://www.wikidata.org/wiki/Q689971","display_name":"Paradigm shift","level":2,"score":0.4874146282672882},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4708554148674011},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44564002752304077},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3877883553504944},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3724755644798279},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.12224262952804565},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3664815","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664815","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3664815","source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2405.15813","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.15813","pdf_url":"https://arxiv.org/pdf/2405.15813","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:ro.ecu.edu.au:ecuworks2022-2026-5597","is_oa":true,"landing_page_url":"https://ro.ecu.edu.au/ecuworks2022-2026/4596","pdf_url":null,"source":{"id":"https://openalex.org/S2765015692","display_name":"Australasian Journal of Paramedicine","issn_l":"2202-7270","issn":["2202-7270"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Research outputs 2022 to 2026","raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3664815","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664815","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3664815","source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.5299999713897705,"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities"}],"awards":[{"id":"https://openalex.org/G2330925116","display_name":null,"funder_award_id":"DE230101058","funder_id":"https://openalex.org/F4320315885","funder_display_name":"Australian Government"}],"funders":[{"id":"https://openalex.org/F4320315885","display_name":"Australian Government","ror":"https://ror.org/0314h5y94"},{"id":"https://openalex.org/F4320320988","display_name":"Edith Cowan University","ror":"https://ror.org/05jhnwe22"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4396861023.pdf"},"referenced_works_count":83,"referenced_works":["https://openalex.org/W1993991024","https://openalex.org/W2053101950","https://openalex.org/W2056339039","https://openalex.org/W2186222003","https://openalex.org/W2224196924","https://openalex.org/W2274499208","https://openalex.org/W2294438834","https://openalex.org/W2309561466","https://openalex.org/W2317053768","https://openalex.org/W2425380982","https://openalex.org/W2466991859","https://openalex.org/W2527524734","https://openalex.org/W2533370895","https://openalex.org/W2580299352","https://openalex.org/W2601271987","https://openalex.org/W2612690371","https://openalex.org/W2619947201","https://openalex.org/W2768642967","https://openalex.org/W2773514261","https://openalex.org/W2802503116","https://openalex.org/W2887051120","https://openalex.org/W2896457183","https://openalex.org/W2904658540","https://openalex.org/W2906430385","https://openalex.org/W2912814345","https://openalex.org/W2914340986","https://openalex.org/W2914868535","https://openalex.org/W2917819557","https://openalex.org/W2940791683","https://openalex.org/W2941719259","https://openalex.org/W2942810189","https://openalex.org/W2944006115","https://openalex.org/W2963177663","https://openalex.org/W2963218601","https://openalex.org/W2965815071","https://openalex.org/W2966715458","https://openalex.org/W2971659033","https://openalex.org/W2975357369","https://openalex.org/W3004554579","https://openalex.org/W3008546095","https://openalex.org/W3011747142","https://openalex.org/W3011934803","https://openalex.org/W3014641072","https://openalex.org/W3015377432","https://openalex.org/W3023633125","https://openalex.org/W3025796084","https://openalex.org/W3034408878","https://openalex.org/W3037516378","https://openalex.org/W3088117316","https://openalex.org/W3094480255","https://openalex.org/W3105232955","https://openalex.org/W3111490429","https://openalex.org/W3113320078","https://openalex.org/W3134307371","https://openalex.org/W3140110584","https://openalex.org/W3143375397","https://openalex.org/W3154596443","https://openalex.org/W3155322285","https://openalex.org/W3175419009","https://openalex.org/W3193761636","https://openalex.org/W3207758636","https://openalex.org/W3213269132","https://openalex.org/W4214910852","https://openalex.org/W4225531458","https://openalex.org/W4234816175","https://openalex.org/W4246329541","https://openalex.org/W4253716310","https://openalex.org/W4282981352","https://openalex.org/W4287777632","https://openalex.org/W4287826895","https://openalex.org/W4297697565","https://openalex.org/W4301409532","https://openalex.org/W4306884028","https://openalex.org/W4307539314","https://openalex.org/W4318715744","https://openalex.org/W4361733981","https://openalex.org/W4390736386","https://openalex.org/W6600983433","https://openalex.org/W6704477683","https://openalex.org/W6754337694","https://openalex.org/W6765307894","https://openalex.org/W6810263219","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W4389505417","https://openalex.org/W2962931510","https://openalex.org/W4380551887","https://openalex.org/W4285159263","https://openalex.org/W2904518532","https://openalex.org/W4280529741","https://openalex.org/W4293919860","https://openalex.org/W2963650472","https://openalex.org/W4387634401","https://openalex.org/W4283332100"],"abstract_inverted_index":{"Due":[0],"to":[1,31,36,102,174,211],"its":[2],"widespread":[3],"applications,":[4],"human":[5],"action":[6,56,85],"recognition":[7,57,86],"is":[8,76,109,193],"one":[9],"of":[10,46,71,105,112,116,130,153,200,220],"the":[11,44,53,68,84,103,110,114,117,126,131,136,147,151,160,198,221,233,238],"most":[12],"widely":[13],"studied":[14],"research":[15,202],"problems":[16],"in":[17,52,73,141,150],"Computer":[18],"Vision.":[19],"Recent":[20],"studies":[21],"have":[22,59,172],"shown":[23],"that":[24,171],"addressing":[25],"it":[26],"using":[27],"multimodal":[28,106,222],"data":[29,41,119],"leads":[30],"superior":[32],"performance":[33],"as":[34],"compared":[35],"relying":[37],"on":[38,62,95,125,167,232],"a":[39,80,188],"single":[40],"modality.":[42],"During":[43],"adoption":[45],"deep":[47],"learning":[48],"for":[49,83,159,243],"visual":[50,74],"modelling":[51,75],"past":[54],"decade,":[55],"approaches":[58],"mainly":[60],"relied":[61],"Convolutional":[63],"Neural":[64],"Networks":[65],"(CNNs).":[66],"However,":[67],"recent":[69,168],"rise":[70],"Transformers":[72],"now":[77],"also":[78,145,216],"causing":[79],"paradigm":[81],"shift":[82],"task.":[87],"This":[88],"survey":[89,192],"captures":[90],"this":[91,142,191],"transition":[92],"while":[93,144],"focusing":[94],"Multimodal":[96],"Human":[97,184],"Action":[98,185],"Recognition":[99,186],"(MHAR).":[100],"Unique":[101],"induction":[104],"computational":[107],"models":[108],"process":[111],"\u2018fusing\u2019":[113],"features":[115],"individual":[118],"modalities.":[120],"Hence,":[121],"we":[122,165,236],"specifically":[123,194],"focus":[124],"fusion":[127,208],"design":[128,169,209],"aspects":[129],"MHAR":[132,177,201],"approaches.":[133],"We":[134,215],"analyze":[135],"classic":[137],"and":[138,155,207,227,240],"emerging":[139],"techniques":[140],"regard,":[143],"highlighting":[146],"popular":[148],"trends":[149],"adaption":[152],"CNN":[154],"Transformer":[156],"building":[157,231],"blocks":[158],"overall":[161],"problem.":[162],"In":[163],"particular,":[164],"emphasize":[166],"choices":[170,210],"led":[173],"more":[175],"efficient":[176],"models.":[178,214],"Unlike":[179],"existing":[180],"reviews,":[181],"which":[182],"discuss":[183,237],"from":[187,224],"broad":[189],"perspective,":[190],"aimed":[195],"at":[196],"pushing":[197],"boundaries":[199],"by":[203],"identifying":[204],"promising":[205],"architectural":[206],"train":[212],"practicable":[213],"provide":[217],"an":[218],"outlook":[219],"datasets":[223],"their":[225],"scale":[226],"evaluation":[228],"viewpoint.":[229],"Finally,":[230],"reviewed":[234],"literature,":[235],"challenges":[239],"future":[241],"avenues":[242],"MHAR.":[244]},"counts_by_year":[{"year":2026,"cited_by_count":15},{"year":2025,"cited_by_count":27},{"year":2024,"cited_by_count":2}],"updated_date":"2026-07-02T09:51:11.867554","created_date":"2024-05-14T00:00:00"}
