{"id":"https://openalex.org/W7125918754","doi":"https://doi.org/10.1109/smc58881.2025.11343496","title":"Contrastive Cross-modal Prototype Prediction and Fusion for Video Anomaly Detection <sup>*</sup>","display_name":"Contrastive Cross-modal Prototype Prediction and Fusion for Video Anomaly Detection <sup>*</sup>","publication_year":2025,"publication_date":"2025-10-05","ids":{"openalex":"https://openalex.org/W7125918754","doi":"https://doi.org/10.1109/smc58881.2025.11343496"},"language":null,"primary_location":{"id":"doi:10.1109/smc58881.2025.11343496","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343496","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108047889","display_name":"Y\u00ec W\u00e1ng","orcid":"https://orcid.org/0000-0001-5697-0717"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junqiao Wang","raw_affiliation_strings":["Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124097433","display_name":"Jiawen Peng","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiawen Peng","raw_affiliation_strings":["Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008114041","display_name":"Jie Chen","orcid":"https://orcid.org/0000-0002-9254-4413"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaxin Chen","raw_affiliation_strings":["Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5082077336","display_name":"J. Andy","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Andy J. Ma","raw_affiliation_strings":["Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,School of Computer Science and Engineering,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5108047889"],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.83869913,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2358","last_page":"2363"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9789999723434448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9789999723434448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.0071000000461936,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0024999999441206455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.6632999777793884},{"id":"https://openalex.org/keywords/anomaly-detection","display_name":"Anomaly detection","score":0.546500027179718},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5044999718666077},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.48399999737739563},{"id":"https://openalex.org/keywords/optical-flow","display_name":"Optical flow","score":0.47519999742507935},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4544999897480011},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.45249998569488525},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.43160000443458557},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4244000017642975},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.40709999203681946}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7633000016212463},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6906999945640564},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.6632999777793884},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.546500027179718},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5044999718666077},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.48399999737739563},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.47519999742507935},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4544999897480011},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43209999799728394},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.43160000443458557},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4244000017642975},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.40709999203681946},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.4000999927520752},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.35260000824928284},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.32820001244544983},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.3176000118255615},{"id":"https://openalex.org/C12997251","wikidata":"https://www.wikidata.org/wiki/Q567560","display_name":"Anomaly (physics)","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.3142000138759613},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.30379998683929443},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2946999967098236},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.28130000829696655},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2759000062942505},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C107551265","wikidata":"https://www.wikidata.org/wiki/Q1458245","display_name":"Displacement (psychology)","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C172849965","wikidata":"https://www.wikidata.org/wiki/Q3148875","display_name":"Reference frame","level":3,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/smc58881.2025.11343496","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343496","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5089877843856812,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"},{"score":0.4906499683856964,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W151377110","https://openalex.org/W1967456674","https://openalex.org/W1994616650","https://openalex.org/W2051224630","https://openalex.org/W2163612318","https://openalex.org/W2341058432","https://openalex.org/W2777342313","https://openalex.org/W2883429621","https://openalex.org/W2903380502","https://openalex.org/W2963610939","https://openalex.org/W2981650061","https://openalex.org/W2987228832","https://openalex.org/W2998993395","https://openalex.org/W3014352273","https://openalex.org/W3092920241","https://openalex.org/W3101466234","https://openalex.org/W3170135154","https://openalex.org/W3177187266","https://openalex.org/W3209806402","https://openalex.org/W4243541726","https://openalex.org/W4387757641","https://openalex.org/W4409365747"],"related_works":[],"abstract_inverted_index":{"Video":[0],"Anomaly":[1],"Detection":[2],"(VAD)":[3],"identifies":[4],"unexpected":[5],"events":[6],"by":[7,33,60],"learning":[8,53,61,177],"normal":[9,16,49,62,114,141,165,180],"behavior":[10],"from":[11,75,167],"surveillance":[12],"footage,":[13],"assuming":[14],"only":[15],"training":[17,155],"data":[18],"is":[19,190],"available.":[20],"Previous":[21],"methods,":[22],"which":[23],"focus":[24],"on":[25,41,120,220],"frame":[26],"reconstruction":[27],"or":[28],"prediction":[29,136],"tasks,":[30],"are":[31],"constrained":[32],"insufficient":[34],"semantic":[35,72],"sensitivity":[36],"due":[37],"to":[38,47,56,138,162,192],"a":[39,85,133],"reliance":[40],"pixel-level":[42],"errors":[43],"and":[44,90,118,123,158,176,205,215],"inadequate":[45],"adaptability":[46],"diverse":[48,203],"patterns.":[50,181],"Although":[51],"contrastive":[52,77],"methods":[54],"attempt":[55],"address":[57],"these":[58,81,197],"issues":[59],"subcategories":[63],"through":[64],"manually":[65],"constructed":[66],"positive-negative":[67,129],"pairs,":[68,130],"they":[69],"still":[70],"encounter":[71],"ambiguity":[73],"arising":[74],"inappropriate":[76],"strategies.":[78],"To":[79],"overcome":[80],"limitations,":[82],"we":[83,131],"propose":[84],"Contrastive":[86,107],"Cross-modal":[87],"Prototype":[88,108,150,185],"Prediction":[89,109],"Fusion":[91,151],"(C<sup":[92],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[93,95],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>P<sup":[94],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>F)":[96],"framework.":[97],"Specifically,":[98],"our":[99,199],"method":[100],"comprises":[101],"three":[102,221],"stages:":[103],"(1)":[104],"First,":[105],"the":[106,144,148,184],"(CPP)":[110],"module":[111,189],"separately":[112],"learns":[113],"patterns":[115,142],"of":[116,212],"appearance":[117],"motion":[119],"RGB":[121,157],"frames":[122],"optical":[124,159],"flow":[125,160],"inputs.":[126],"Without":[127],"constructing":[128],"perform":[132],"cross-view":[134],"prototype":[135,213],"task":[137],"discern":[139],"inherent":[140],"within":[143],"data.":[145],"(2)":[146],"Then,":[147],"Cross-Modal":[149],"(CMPF)":[152],"conducts":[153],"alternative":[154],"with":[156,218],"inputs":[161],"establish":[163],"comprehensive":[164],"representations":[166],"two":[168],"complementary":[169],"modalities,":[170],"enforcing":[171],"consistency":[172],"between":[173],"cross-modal":[174,216],"prototypes":[175],"semantically":[178],"rich":[179],"(3)":[182],"Finally,":[183],"Number":[186],"Adjustment":[187],"(PNA)":[188],"employed":[191],"mitigate":[193],"initialization":[194],"bias.":[195],"Integrating":[196],"components,":[198],"approach":[200],"adaptively":[201],"models":[202],"normalcy":[204],"enhances":[206],"anomaly":[207],"discrimination":[208],"via":[209],"joint":[210],"optimization":[211],"stability":[214],"consistency,":[217],"experiments":[219],"benchmarks":[222],"showing":[223],"its":[224],"superiority.":[225]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-29T00:00:00"}
