{"id":"https://openalex.org/W7160327002","doi":"https://doi.org/10.1109/wacv61042.2026.00251","title":"A-V Representation Learning via Audio Shift Prediction for Multimodal Deepfake Detection and Temporal Localization","display_name":"A-V Representation Learning via Audio Shift Prediction for Multimodal Deepfake Detection and Temporal Localization","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7160327002","doi":"https://doi.org/10.1109/wacv61042.2026.00251"},"language":null,"primary_location":{"id":"doi:10.1109/wacv61042.2026.00251","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00251","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033253439","display_name":"Ashutosh Anshul","orcid":"https://orcid.org/0000-0001-8883-7494"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ashutosh Anshul","raw_affiliation_strings":["Nanyang Technological University,College of Computing and Data Science,Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University,College of Computing and Data Science,Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135385850","display_name":"Eng Siong Chng","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Eng Siong Chng","raw_affiliation_strings":["Nanyang Technological University,College of Computing and Data Science,Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University,College of Computing and Data Science,Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5135369343","display_name":"Deepu Rajan","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Deepu Rajan","raw_affiliation_strings":["Nanyang Technological University,College of Computing and Data Science,Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanyang Technological University,College of Computing and Data Science,Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.6933429,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2553","last_page":"2563"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7750999927520752,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7750999927520752,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.02319999970495701,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.020500000566244125,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5076000094413757},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.3752000033855438},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3702999949455261},{"id":"https://openalex.org/keywords/statistical-learning","display_name":"Statistical learning","score":0.314300000667572},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.2996000051498413}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6714000105857849},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6029000282287598},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5076000094413757},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3702999949455261},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.33340001106262207},{"id":"https://openalex.org/C2982736386","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Statistical learning","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2996000051498413},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28529998660087585},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.25760000944137573},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2574000060558319}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/wacv61042.2026.00251","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00251","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W1927052826","https://openalex.org/W2019464758","https://openalex.org/W2558151185","https://openalex.org/W2808631503","https://openalex.org/W2982058372","https://openalex.org/W2983918066","https://openalex.org/W2984700035","https://openalex.org/W3081492798","https://openalex.org/W3093010840","https://openalex.org/W3093077034","https://openalex.org/W3173161217","https://openalex.org/W3174656926","https://openalex.org/W3175342695","https://openalex.org/W3176641851","https://openalex.org/W3196204467","https://openalex.org/W4214684483","https://openalex.org/W4214691743","https://openalex.org/W4214876496","https://openalex.org/W4221149434","https://openalex.org/W4280579728","https://openalex.org/W4312472072","https://openalex.org/W4312508181","https://openalex.org/W4316661129","https://openalex.org/W4319978495","https://openalex.org/W4320882980","https://openalex.org/W4360993864","https://openalex.org/W4372260514","https://openalex.org/W4382318331","https://openalex.org/W4385805162","https://openalex.org/W4386065949","https://openalex.org/W4386075957","https://openalex.org/W4386076085","https://openalex.org/W4386076638","https://openalex.org/W4386076652","https://openalex.org/W4386102876","https://openalex.org/W4386267173","https://openalex.org/W4386272941","https://openalex.org/W4386524605","https://openalex.org/W4386928847","https://openalex.org/W4390874333","https://openalex.org/W4392904345","https://openalex.org/W4394595619","https://openalex.org/W4400975066","https://openalex.org/W4402715910","https://openalex.org/W4402716164","https://openalex.org/W4402716253","https://openalex.org/W4402774452","https://openalex.org/W4402774698","https://openalex.org/W4403791323","https://openalex.org/W4405778913","https://openalex.org/W7133239857"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"multimodal":[1,19],"deepfake":[2,147],"detection":[3],"methods":[4],"typically":[5],"rely":[6],"on":[7,17,172],"single-stage":[8],"training,":[9],"which":[10],"can":[11],"cause":[12],"the":[13,61,127,134,154,163],"model":[14,135],"to":[15,28,39,98,115,126,136,181],"focus":[16],"dataset-specific":[18],"cues":[20],"while":[21],"missing":[22],"important":[23],"features":[24,132],"that":[25,58,84],"are":[26,72],"helpful":[27],"detect":[29,99],"unseen":[30],"manipulations,":[31],"thereby":[32],"limiting":[33],"generalization.":[34],"While":[35],"some":[36],"approaches":[37],"attempt":[38],"address":[40],"this":[41,96],"using":[42],"self-supervised":[43,111],"audio-visual":[44,87],"pretraining,":[45],"they":[46,55],"may":[47],"not":[48,158],"fully":[49,116],"exploit":[50],"cross-modal":[51,118],"temporal":[52,88,106,119,123,188],"information.":[53],"Also,":[54],"often":[56],"assume":[57],"manipulations":[59,138,183],"affect":[60],"entire":[62,140],"video,":[63],"ignoring":[64],"more":[65],"realistic":[66],"cases":[67],"where":[68],"only":[69],"short":[70],"segments":[71,148],"altered.":[73],"To":[74],"overcome":[75],"these":[76],"limitations,":[77],"we":[78],"propose":[79,109],"a":[80,110],"two-stage":[81],"training":[82],"framework":[83],"first":[85],"learns":[86],"alignment":[89,120],"in":[90,149],"real":[91],"videos":[92,141],"and":[93,100,169,184,186],"then":[94],"uses":[95],"information":[97],"localize":[101,146],"potential":[102],"deepfakes":[103],"by":[104],"identifying":[105],"inconsistencies.":[107],"We":[108],"shift-prediction":[112],"pretraining":[113],"objective":[114],"understand":[117],"across":[121,139],"multiple":[122],"shifts":[124],"applied":[125],"audio":[128],"input.":[129],"The":[130],"pretrained":[131,155],"enable":[133],"identify":[137],"as":[142,144],"well":[143],"accurately":[145],"partially":[150],"tampered":[151],"content.":[152],"Moreover,":[153],"components":[156],"do":[157],"require":[159],"task-specific":[160],"fine-tuning,":[161],"improving":[162],"model\u2019s":[164],"flexibility":[165],"for":[166],"both":[167],"classification":[168],"localization.":[170],"Experiments":[171],"benchmark":[173],"datasets":[174],"demonstrate":[175],"strong":[176],"within-dataset":[177],"performance,":[178],"robust":[179],"generalization":[180],"new":[182],"datasets,":[185],"accurate":[187],"localization.<sup":[189],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[190],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[191]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
