{"id":"https://openalex.org/W4388233659","doi":"https://doi.org/10.1145/3581783.3612301","title":"Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action and Gesture Recognition","display_name":"Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action and Gesture Recognition","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4388233659","doi":"https://doi.org/10.1145/3581783.3612301"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612301","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612301","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035467487","display_name":"Yujun Ma","orcid":"https://orcid.org/0000-0002-8014-0067"},"institutions":[{"id":"https://openalex.org/I43313876","display_name":"Dalian Maritime University","ror":"https://ror.org/002b7nr53","country_code":"CN","type":"education","lineage":["https://openalex.org/I43313876"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yujun Ma","raw_affiliation_strings":["Dalian Maritime University &amp; Massey University, Dalian, China"],"raw_orcid":"https://orcid.org/0000-0002-8014-0067","affiliations":[{"raw_affiliation_string":"Dalian Maritime University &amp; Massey University, Dalian, China","institution_ids":["https://openalex.org/I43313876"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054200099","display_name":"Benjia Zhou","orcid":"https://orcid.org/0000-0003-4883-5552"},"institutions":[{"id":"https://openalex.org/I111950717","display_name":"Macau University of Science and Technology","ror":"https://ror.org/03jqs2n27","country_code":"MO","type":"education","lineage":["https://openalex.org/I111950717","https://openalex.org/I4391767947"]}],"countries":["MO"],"is_corresponding":false,"raw_author_name":"Benjia Zhou","raw_affiliation_strings":["Macau University of Science and Technology, Macau SAR, China"],"raw_orcid":"https://orcid.org/0000-0003-4883-5552","affiliations":[{"raw_affiliation_string":"Macau University of Science and Technology, Macau SAR, China","institution_ids":["https://openalex.org/I111950717"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022594144","display_name":"Ruili Wang","orcid":"https://orcid.org/0000-0003-2899-9816"},"institutions":[{"id":"https://openalex.org/I43313876","display_name":"Dalian Maritime University","ror":"https://ror.org/002b7nr53","country_code":"CN","type":"education","lineage":["https://openalex.org/I43313876"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruili Wang","raw_affiliation_strings":["Dalian Maritime University &amp; Massey University, Dalian, China"],"raw_orcid":"https://orcid.org/0000-0003-2899-9816","affiliations":[{"raw_affiliation_string":"Dalian Maritime University &amp; Massey University, Dalian, China","institution_ids":["https://openalex.org/I43313876"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5042680345","display_name":"Pichao Wang","orcid":"https://orcid.org/0000-0002-1430-0237"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pichao Wang","raw_affiliation_strings":["Amazon Prime Video, Seattle, WA, USA"],"raw_orcid":"https://orcid.org/0000-0002-1430-0237","affiliations":[{"raw_affiliation_string":"Amazon Prime Video, Seattle, WA, USA","institution_ids":["https://openalex.org/I1311688040"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.1333,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.89702797,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3149","last_page":"3160"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12740","display_name":"Gait Recognition and Analysis","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7838283777236938},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.6512911915779114},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5900577902793884},{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.5243447422981262},{"id":"https://openalex.org/keywords/gesture-recognition","display_name":"Gesture recognition","score":0.4557610750198364},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.44794654846191406},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3686094284057617}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7838283777236938},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.6512911915779114},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5900577902793884},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.5243447422981262},{"id":"https://openalex.org/C159437735","wikidata":"https://www.wikidata.org/wiki/Q1519524","display_name":"Gesture recognition","level":3,"score":0.4557610750198364},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.44794654846191406},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3686094284057617}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3612301","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612301","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5899999737739563,"display_name":"Life below water","id":"https://metadata.un.org/sdg/14"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W2799211965","https://openalex.org/W2944006115","https://openalex.org/W3092499653","https://openalex.org/W3096803658","https://openalex.org/W3110917564","https://openalex.org/W3119567997","https://openalex.org/W3157704410","https://openalex.org/W4205831148","https://openalex.org/W4282943820","https://openalex.org/W4292622422","https://openalex.org/W4312560592","https://openalex.org/W4319300817","https://openalex.org/W4321368636","https://openalex.org/W4321488262","https://openalex.org/W4367626083"],"related_works":["https://openalex.org/W2066003895","https://openalex.org/W2902873204","https://openalex.org/W2185750513","https://openalex.org/W2010878661","https://openalex.org/W3147379364","https://openalex.org/W2026258298","https://openalex.org/W3204639664","https://openalex.org/W2970836791","https://openalex.org/W2805039731","https://openalex.org/W2989699735"],"abstract_inverted_index":{"RGB-D":[0,27,119,218],"action":[1,29,120,219],"and":[2,19,30,49,91,97,121,137,148,167,182,189,194,220],"gesture":[3,31,122,221],"recognition":[4,32,222],"remain":[5],"an":[6,109],"interesting":[7],"topic":[8],"in":[9,22,208],"human-centered":[10],"scene":[11],"understanding,":[12],"primarily":[13],"due":[14],"to":[15,64,86,171,177],"the":[16,76,104,149,160,172,179,186],"multiple":[17,45,138,150],"granularities":[18],"large":[20],"variation":[21],"human":[23],"motion.":[24],"Although":[25],"many":[26],"based":[28],"approaches":[33,216],"have":[34],"demonstrated":[35],"remarkable":[36],"results":[37,207],"by":[38],"utilizing":[39],"highly":[40,80],"integrated":[41,81],"spatio-temporal":[42,82,100,140,152,165,174,211],"representations":[43],"across":[44],"modalities":[46],"(i.e.,":[47],"RGB":[48],"depth":[50],"data),":[51],"they":[52],"still":[53],"encounter":[54],"several":[55],"challenges.":[56],"Firstly,":[57],"vanilla":[58],"3D":[59,130],"convolution":[60],"makes":[61],"it":[62],"hard":[63],"capture":[65,178],"fine-grained":[66,145],"motion":[67],"differences":[68],"between":[69],"local":[70],"clips":[71],"under":[72],"different":[73],"modalities.":[74],"Secondly,":[75],"intricate":[77],"nature":[78],"of":[79,203],"modeling":[83],"can":[84,94],"lead":[85],"optimization":[87],"difficulties.":[88],"Thirdly,":[89],"duplicate":[90],"unnecessary":[92],"information":[93],"add":[95],"complexity":[96],"complicate":[98],"entangled":[99],"modeling.":[101],"To":[102],"address":[103],"above":[105],"issues,":[106],"we":[107],"propose":[108],"innovative":[110,205],"heuristic":[111],"architecture":[112],"called":[113],"Multi-stage":[114],"Factorized":[115],"Spatio-Temporal":[116],"(MFST)":[117],"for":[118],"recognition.":[123],"The":[124,142,200],"proposed":[125],"MFST":[126],"model":[127],"comprises":[128],"a":[129,209],"Central":[131],"Difference":[132],"Convolution":[133,188],"Stem":[134],"(CDC-Stem)":[135],"module":[136,162],"factorized":[139],"stages.":[141],"CDC-Stem":[143,161],"enriches":[144],"temporal":[146,183],"perception,":[147],"hierarchical":[151,180],"stages":[153,176],"construct":[154],"dimension-independent":[155],"higher-order":[156],"semantic":[157],"primitives.":[158],"Specifically,":[159],"captures":[163],"bottom-level":[164],"features":[166,184],"passes":[168],"them":[169],"successively":[170],"following":[173],"factored":[175],"spatial":[181],"through":[185],"Multi-Scale":[187,196],"Transformer":[190,197],"(MSC-Trans)":[191],"hybrid":[192],"block":[193],"Weight-shared":[195],"(WMS-Trans)":[198],"block.":[199],"seamless":[201],"integration":[202],"these":[204],"designs":[206],"robust":[210],"representation":[212],"that":[213],"outperforms":[214],"state-of-the-art":[215],"on":[217],"datasets.":[223]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
