{"id":"https://openalex.org/W7127587083","doi":"https://doi.org/10.48550/arxiv.2602.02980","title":"WST-X Series: Wavelet Scattering Transform for Interpretable Speech Deepfake Detection","display_name":"WST-X Series: Wavelet Scattering Transform for Interpretable Speech Deepfake Detection","publication_year":2026,"publication_date":"2026-02-03","ids":{"openalex":"https://openalex.org/W7127587083","doi":"https://doi.org/10.48550/arxiv.2602.02980"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.02980","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025606610","display_name":"Xi Xuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xuan, Xi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039257533","display_name":"Davide Carbone","orcid":"https://orcid.org/0000-0003-2859-6603"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Carbone, Davide","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086898291","display_name":"Ruchi Pandey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wenxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036114061","display_name":"Wenxin Zhang","orcid":"https://orcid.org/0009-0000-8916-6944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pandey, Ruchi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125026539","display_name":"Tomi H. Kinnunen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kinnunen, Tomi H.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5025606610"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6654000282287598,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6654000282287598,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.06780000030994415,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.05480000004172325,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8788999915122986},{"id":"https://openalex.org/keywords/wavelet","display_name":"Wavelet","score":0.6600000262260437},{"id":"https://openalex.org/keywords/wavelet-transform","display_name":"Wavelet transform","score":0.5232999920845032},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5212000012397766},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4864000082015991},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.42989999055862427},{"id":"https://openalex.org/keywords/filter-bank","display_name":"Filter bank","score":0.3702000081539154},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.3431999981403351}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8788999915122986},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6687999963760376},{"id":"https://openalex.org/C47432892","wikidata":"https://www.wikidata.org/wiki/Q831390","display_name":"Wavelet","level":2,"score":0.6600000262260437},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5999000072479248},{"id":"https://openalex.org/C196216189","wikidata":"https://www.wikidata.org/wiki/Q2867","display_name":"Wavelet transform","level":3,"score":0.5232999920845032},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5212000012397766},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4864000082015991},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.42989999055862427},{"id":"https://openalex.org/C100515483","wikidata":"https://www.wikidata.org/wiki/Q3268235","display_name":"Filter bank","level":3,"score":0.3702000081539154},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.3431999981403351},{"id":"https://openalex.org/C46286280","wikidata":"https://www.wikidata.org/wiki/Q2414958","display_name":"Discrete wavelet transform","level":4,"score":0.3248000144958496},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32350000739097595},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.3140999972820282},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.31299999356269836},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2937999963760376},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.28929999470710754},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26010000705718994},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C191486275","wikidata":"https://www.wikidata.org/wiki/Q210028","display_name":"Scattering","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.02980","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.02980","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.02980","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.02980","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.4736234247684479,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"this":[1],"work,":[2],"we":[3],"focus":[4],"on":[5,92,101],"front-end":[6],"design":[7],"for":[8,135,148],"speech":[9,149],"deepfake":[10,150],"detectors,":[11],"the":[12,16,22,57,68,74,93,102,141],"component":[13],"that":[14,66,107,119],"determines":[15],"discriminative":[17],"acoustic":[18],"cues":[19],"provided":[20],"to":[21,86],"classifier.":[23],"Existing":[24],"approaches":[25],"are":[26,35],"primarily":[27],"categorized":[28],"into":[29],"two":[30],"types.":[31],"Hand-crafted":[32],"filterbank":[33],"features":[34,147],"transparent":[36],"but":[37],"limited":[38],"in":[39,45],"capturing":[40,136],"higher-level":[41],"information.":[42],"SSL":[43],"features,":[44],"turn,":[46],"lack":[47],"interpretability":[48],"and":[49,104,128,145],"may":[50],"overlook":[51],"fine-grained":[52],"spectral":[53],"anomalies.":[54],"We":[55],"propose":[56],"WST-X":[58,108],"series,":[59],"a":[60,113,120],"novel":[61],"family":[62],"of":[63,70,143],"feature":[64],"extractors":[65],"combines":[67],"best":[69],"both":[71],"worlds":[72],"via":[73],"wavelet":[75,81],"scattering":[76],"transform":[77],"(WST),":[78],"which":[79],"cascades":[80],"convolutions":[82],"with":[83,98,126],"modulus":[84],"nonlinearities":[85],"produce":[87],"deformation-stable,":[88],"multi-scale":[89],"features.":[90],"Experiments":[91],"recent":[94],"Deepfake-Eval-2024":[95],"benchmark,":[96],"together":[97],"cross-dataset":[99],"evaluations":[100],"SpoofCeleb":[103],"In-the-Wild,":[105],"show":[106],"outperforms":[109],"existing":[110],"front-ends":[111],"by":[112],"wide":[114],"margin.":[115],"Our":[116],"analysis":[117],"reveals":[118],"small":[121],"averaging":[122],"scale":[123],"($J$),":[124],"combined":[125],"high-frequency":[127],"directional":[129],"resolutions":[130],"($Q$,":[131],"$L$),":[132],"is":[133,154],"critical":[134],"subtle":[137],"artifacts.":[138],"This":[139],"underscores":[140],"value":[142],"stable":[144],"translation-invariant":[146],"detection.":[151],"The":[152],"code":[153],"available":[155],"at":[156],"https://github.com/xxuan-acoustics/WST-X-Series.":[157]},"counts_by_year":[],"updated_date":"2026-05-02T06:04:40.494371","created_date":"2026-02-06T00:00:00"}
