{"id":"https://openalex.org/W7131259250","doi":"https://doi.org/10.1109/vcip67698.2025.11396919","title":"M <sup>2</sup> S <sup>2</sup> L: Mamba-based Multi-Scale Spatial-temporal Learning for Video Anomaly Detection","display_name":"M <sup>2</sup> S <sup>2</sup> L: Mamba-based Multi-Scale Spatial-temporal Learning for Video Anomaly Detection","publication_year":2025,"publication_date":"2025-12-01","ids":{"openalex":"https://openalex.org/W7131259250","doi":"https://doi.org/10.1109/vcip67698.2025.11396919"},"language":null,"primary_location":{"id":"doi:10.1109/vcip67698.2025.11396919","is_oa":false,"landing_page_url":"https://doi.org/10.1109/vcip67698.2025.11396919","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Visual Communications and Image Processing (VCIP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100744137","display_name":"Yi Liu","orcid":"https://orcid.org/0000-0002-1058-0382"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yang Liu","raw_affiliation_strings":["Tongji University"],"affiliations":[{"raw_affiliation_string":"Tongji University","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071650569","display_name":"Boan Chen","orcid":"https://orcid.org/0000-0003-4484-3416"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Boan Chen","raw_affiliation_strings":["SJTU"],"affiliations":[{"raw_affiliation_string":"SJTU","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100685782","display_name":"Xiaoguang Zhu","orcid":"https://orcid.org/0000-0001-9554-2133"},"institutions":[{"id":"https://openalex.org/I84218800","display_name":"University of California, Davis","ror":"https://ror.org/05rrcem69","country_code":"US","type":"education","lineage":["https://openalex.org/I84218800"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaoguang Zhu","raw_affiliation_strings":["UC Davis"],"affiliations":[{"raw_affiliation_string":"UC Davis","institution_ids":["https://openalex.org/I84218800"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126720315","display_name":"Jing Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I141945490","display_name":"University of British Columbia","ror":"https://ror.org/03rmrcq20","country_code":"CA","type":"education","lineage":["https://openalex.org/I141945490"]},{"id":"https://openalex.org/I4210139194","display_name":"Universidad Braulio Carrillo","ror":"https://ror.org/041zaaf26","country_code":"CR","type":"education","lineage":["https://openalex.org/I4210139194"]}],"countries":["CA","CR"],"is_corresponding":false,"raw_author_name":"Jing Liu","raw_affiliation_strings":["UBC"],"affiliations":[{"raw_affiliation_string":"UBC","institution_ids":["https://openalex.org/I4210139194","https://openalex.org/I141945490"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126673191","display_name":"Peng Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159968","display_name":"Duke Kunshan University","ror":"https://ror.org/04sr5ys16","country_code":"CN","type":"education","lineage":["https://openalex.org/I170897317","https://openalex.org/I37461747","https://openalex.org/I4210159968"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Sun","raw_affiliation_strings":["Duke Kunshan University"],"affiliations":[{"raw_affiliation_string":"Duke Kunshan University","institution_ids":["https://openalex.org/I4210159968"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5126675602","display_name":"Wei Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I79510175","display_name":"Cardiff University","ror":"https://ror.org/03kk7td41","country_code":"GB","type":"education","lineage":["https://openalex.org/I79510175"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wei Zhou","raw_affiliation_strings":["Cardiff University"],"affiliations":[{"raw_affiliation_string":"Cardiff University","institution_ids":["https://openalex.org/I79510175"]}]}],"institutions":[],"countries_distinct_count":5,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100744137"],"corresponding_institution_ids":["https://openalex.org/I116953780"],"apc_list":null,"apc_paid":null,"fwci":2.1819,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.92917281,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.002199999988079071,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/anomaly-detection","display_name":"Anomaly detection","score":0.7069000005722046},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.682699978351593},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5612000226974487},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5435000061988831},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5141000151634216},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4684999883174896},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.4027999937534332},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.37439998984336853},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.34689998626708984}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8004000186920166},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.7069000005722046},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.682699978351593},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6212999820709229},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5612000226974487},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5435000061988831},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5245000123977661},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5141000151634216},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4684999883174896},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.4027999937534332},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39239999651908875},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.37439998984336853},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.34689998626708984},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3336000144481659},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3296000063419342},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C2780624872","wikidata":"https://www.wikidata.org/wiki/Q852453","display_name":"Motion detection","level":3,"score":0.28450000286102295},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2599000036716461},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.2567000091075897},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.25130000710487366},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/vcip67698.2025.11396919","is_oa":false,"landing_page_url":"https://doi.org/10.1109/vcip67698.2025.11396919","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Visual Communications and Image Processing (VCIP)","raw_type":"proceedings-article"},{"id":"pmh:oai:https://orca.cardiff.ac.uk:185725","is_oa":false,"landing_page_url":"https://orca.cardiff.ac.uk/view/cardiffauthors/A2824239D.html>","pdf_url":null,"source":{"id":"https://openalex.org/S7407055383","display_name":"ORCA Online Research @Cardiff","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1967456674","https://openalex.org/W2163612318","https://openalex.org/W2341058432","https://openalex.org/W2753526808","https://openalex.org/W2777342313","https://openalex.org/W2921491036","https://openalex.org/W2963045681","https://openalex.org/W2963610939","https://openalex.org/W2981650061","https://openalex.org/W2987228832","https://openalex.org/W3024312292","https://openalex.org/W3035240825","https://openalex.org/W3177187266","https://openalex.org/W3190318906","https://openalex.org/W3196890755","https://openalex.org/W4220900860","https://openalex.org/W4385486148","https://openalex.org/W4386799843","https://openalex.org/W4387968140","https://openalex.org/W4391019661","https://openalex.org/W4391614403","https://openalex.org/W4393970287","https://openalex.org/W4401024146","https://openalex.org/W4401878806","https://openalex.org/W4402685102","https://openalex.org/W4403601156","https://openalex.org/W4405642802","https://openalex.org/W4406860633","https://openalex.org/W4409129948","https://openalex.org/W4409356433","https://openalex.org/W4409365747","https://openalex.org/W4409507112","https://openalex.org/W4413967215","https://openalex.org/W4415708572","https://openalex.org/W7131085086"],"related_works":[],"abstract_inverted_index":{"Video":[0],"anomaly":[1,132],"detection":[2,24],"(VAD)":[3],"is":[4],"an":[5],"essential":[6],"task":[7],"in":[8,15,22,85],"the":[9],"image":[10],"processing":[11],"community":[12],"with":[13,26,35,165],"prospects":[14],"video":[16,30],"surveillance,":[17],"which":[18],"faces":[19],"fundamental":[20],"challenges":[21],"balancing":[23],"accuracy":[25],"computational":[27,64],"efficiency.":[28],"As":[29],"content":[31],"becomes":[32],"increasingly":[33],"complex":[34],"diverse":[36],"behavioral":[37,128],"patterns":[38],"and":[39,99,122,130,150,159,168],"contextual":[40],"scenarios,":[41],"traditional":[42],"VAD":[43],"approaches":[44],"struggle":[45],"to":[46,116],"provide":[47],"robust":[48],"assessment":[49],"for":[50,66,120,176],"modern":[51],"surveillance":[52,178],"systems.":[53],"Existing":[54],"methods":[55],"either":[56],"lack":[57],"comprehensive":[58],"spatial-temporal":[59,77],"modeling":[60,129],"or":[61],"require":[62],"excessive":[63],"resources":[65],"real-time":[67],"applications.":[68],"In":[69],"this":[70,86],"regard,":[71],"we":[72],"present":[73],"a":[74,112],"Mamba-based":[75],"multi-scale":[76],"learning":[78],"(M<sup":[79],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[80,82,142,144],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>S<sup":[81,143],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>L)":[83],"framework":[84,146],"paper.":[87],"The":[88],"proposed":[89],"method":[90],"employs":[91],"hierarchical":[92],"spatial":[93],"encoders":[94,101],"operating":[95],"at":[96],"multiple":[97],"granularities":[98],"multi-temporal":[100],"capturing":[102],"motion":[103,123],"dynamics":[104],"across":[105],"different":[106],"time":[107],"scales.":[108],"We":[109],"also":[110],"introduce":[111],"feature":[113],"decomposition":[114],"mechanism":[115],"enable":[117],"task-specific":[118],"optimization":[119],"appearance":[121],"reconstruction,":[124],"facilitating":[125],"more":[126],"nuanced":[127],"quality-aware":[131],"assessment.":[133],"Experiments":[134],"on":[135,154],"three":[136],"benchmark":[137],"datasets":[138],"demonstrate":[139],"that":[140],"M<sup":[141],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>L":[145],"achieves":[147],"98.5%,":[148],"92.1%,":[149],"77.9%":[151],"frame-level":[152],"AUCs":[153],"UCSD":[155],"Ped2,":[156],"CUHK":[157],"Avenue,":[158],"ShanghaiTech":[160],"respectively,":[161],"while":[162],"maintaining":[163],"efficiency":[164],"20.1G":[166],"FLOPs":[167],"45":[169],"FPS":[170],"inference":[171],"speed,":[172],"making":[173],"it":[174],"suitable":[175],"practical":[177],"deployment.":[179]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2026-02-25T00:00:00"}
