{"id":"https://openalex.org/W4415540820","doi":"https://doi.org/10.1145/3746027.3761982","title":"KLASSify to Verify: Audio-Visual Deepfake Detection Using SSL-based Audio and Handcrafted Visual Features","display_name":"KLASSify to Verify: Audio-Visual Deepfake Detection Using SSL-based Audio and Handcrafted Visual Features","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540820","doi":"https://doi.org/10.1145/3746027.3761982"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3761982","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3761982","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083404931","display_name":"Ivan Kukanov","orcid":"https://orcid.org/0000-0003-4052-2754"},"institutions":[{"id":"https://openalex.org/I1325159990","display_name":"Ministry of Defence","ror":"https://ror.org/05m38qe70","country_code":"SG","type":"government","lineage":["https://openalex.org/I1325159990"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ivan Kukanov","raw_affiliation_strings":["KLASS Engineering and Solutions, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0003-4052-2754","affiliations":[{"raw_affiliation_string":"KLASS Engineering and Solutions, Singapore, Singapore","institution_ids":["https://openalex.org/I1325159990"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059611340","display_name":"Jun Wah Ng","orcid":null},"institutions":[{"id":"https://openalex.org/I1325159990","display_name":"Ministry of Defence","ror":"https://ror.org/05m38qe70","country_code":"SG","type":"government","lineage":["https://openalex.org/I1325159990"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Jun Wah Ng","raw_affiliation_strings":["KLASS Engineering and Solutions, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0005-3299-8113","affiliations":[{"raw_affiliation_string":"KLASS Engineering and Solutions, Singapore, Singapore","institution_ids":["https://openalex.org/I1325159990"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.26220024,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"13707","last_page":"13713"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9927999973297119,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8939999938011169},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.8303999900817871},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.47859999537467957},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4717999994754791},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.39969998598098755},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.3937999904155731},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3765000104904175}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8939999938011169},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.8303999900817871},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8241999745368958},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6883999705314636},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5516999959945679},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.47859999537467957},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4717999994754791},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.39969998598098755},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.3937999904155731},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3765000104904175},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.3727000057697296},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.3294999897480011},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3084999918937683},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2888999879360199},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.262800008058548}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3761982","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3761982","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W2004763266","https://openalex.org/W4313306150","https://openalex.org/W4382239873","https://openalex.org/W4386102876","https://openalex.org/W4386272941","https://openalex.org/W4406303205","https://openalex.org/W4406461482"],"related_works":[],"abstract_inverted_index":{"The":[0],"rapid":[1],"development":[2],"of":[3,29,133,141],"audio-driven":[4],"talking":[5],"head":[6],"generators":[7],"and":[8,31,50,82,115,121,139],"advanced":[9],"Text-To-Speech":[10],"(TTS)":[11],"models":[12],"has":[13],"led":[14],"to":[15,52,54,79,100],"more":[16],"sophisticated":[17],"temporal":[18,144],"deepfakes.":[19],"These":[20],"advances":[21],"highlight":[22],"the":[23,67,72,85,125,148],"need":[24],"for":[25,66,135,143],"robust":[26],"methods":[27],"capable":[28],"detecting":[30],"localizing":[32],"deepfakes,":[33],"even":[34],"under":[35],"novel,":[36],"unseen":[37],"attack":[38],"scenarios.":[39],"Current":[40],"state-of-the-art":[41],"deepfake":[42,136],"detectors,":[43],"while":[44],"accurate,":[45],"are":[46],"often":[47],"computationally":[48],"expensive":[49],"struggle":[51],"generalize":[53],"novel":[55],"manipulation":[56],"techniques.":[57],"To":[58],"address":[59],"these":[60],"challenges,":[61],"we":[62,75,88],"propose":[63],"multimodal":[64,129],"approaches":[65],"AV-Deepfake1M":[68],"2025":[69],"challenge.":[70],"For":[71,84],"visual":[73],"modality,":[74,87],"leverage":[76],"handcrafted":[77],"features":[78],"improve":[80],"interpretability":[81],"adaptability.":[83],"audio":[86,103,149],"adapt":[89],"a":[90,111],"self-supervised":[91],"learning":[92],"(SSL)":[93],"backbone":[94],"coupled":[95],"with":[96],"graph":[97],"attention":[98],"networks":[99],"capture":[101],"rich":[102],"representations,":[104],"improving":[105],"detection":[106],"robustness.":[107],"Our":[108],"approach":[109],"strikes":[110],"balance":[112],"between":[113],"performance":[114],"real-world":[116],"deployment,":[117],"focusing":[118],"on":[119],"resilience":[120],"potential":[122],"interpretability.":[123],"On":[124],"AV-Deepfake1M++":[126],"dataset,":[127],"our":[128],"system":[130],"achieves":[131],"AUC":[132],"92.78%":[134],"classification":[137],"task":[138],"IoU":[140],"0.3536":[142],"localization":[145],"using":[146],"only":[147],"modality.":[150]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-25T00:00:00"}
