{"id":"https://openalex.org/W4416771725","doi":"https://doi.org/10.1109/sped67700.2025.11252230","title":"Augmented Transfer Learning for Synthetic Speech Detection","display_name":"Augmented Transfer Learning for Synthetic Speech Detection","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416771725","doi":"https://doi.org/10.1109/sped67700.2025.11252230"},"language":null,"primary_location":{"id":"doi:10.1109/sped67700.2025.11252230","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11252230","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106106234","display_name":"Irina Mutica","orcid":null},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":true,"raw_author_name":"Irina Mutica","raw_affiliation_strings":["National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania","institution_ids":["https://openalex.org/I61641377"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020802166","display_name":"\u015eerban Mihalache","orcid":"https://orcid.org/0000-0001-6540-2359"},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Serban Mihalache","raw_affiliation_strings":["National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania","institution_ids":["https://openalex.org/I61641377"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006881797","display_name":"Gheorghe Pop","orcid":"https://orcid.org/0000-0002-8193-7991"},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Gheorghe Pop","raw_affiliation_strings":["National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania","institution_ids":["https://openalex.org/I61641377"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012068854","display_name":"Drago\u015f Burileanu","orcid":"https://orcid.org/0000-0002-7675-3506"},"institutions":[{"id":"https://openalex.org/I61641377","display_name":"Universitatea Na\u021bional\u0103 de \u0218tiin\u021b\u0103 \u0219i Tehnologie Politehnica Bucure\u0219ti","ror":"https://ror.org/0558j5q12","country_code":"RO","type":"education","lineage":["https://openalex.org/I61641377"]}],"countries":["RO"],"is_corresponding":false,"raw_author_name":"Dragos Burileanu","raw_affiliation_strings":["National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania"],"affiliations":[{"raw_affiliation_string":"National University of Science and Technology POLITEHNICA,Speech and Dialogue Research Laboratory (SpeeD),Bucharest,Romania","institution_ids":["https://openalex.org/I61641377"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5106106234"],"corresponding_institution_ids":["https://openalex.org/I61641377"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.20613578,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"7","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8877999782562256,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8877999782562256,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0494999997317791,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.01889999955892563,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8169999718666077},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.6050999760627747},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6040999889373779},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5867999792098999},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.5174999833106995},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.49549999833106995},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.47940000891685486},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.40139999985694885},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.39070001244544983}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8169999718666077},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7494999766349792},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6226999759674072},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.6050999760627747},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6040999889373779},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5867999792098999},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5842999815940857},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.5174999833106995},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.49549999833106995},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.47940000891685486},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.40139999985694885},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39070001244544983},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.34389999508857727},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.33809998631477356},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33649998903274536},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3262999951839447},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.321399986743927},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.30660000443458557},{"id":"https://openalex.org/C88516994","wikidata":"https://www.wikidata.org/wiki/Q1268863","display_name":"Dynamic time warping","level":2,"score":0.26739999651908875},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.2648000121116638},{"id":"https://openalex.org/C81299745","wikidata":"https://www.wikidata.org/wiki/Q334269","display_name":"Transfer function","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2590999901294708},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.2567000091075897},{"id":"https://openalex.org/C137584468","wikidata":"https://www.wikidata.org/wiki/Q35395","display_name":"Phonetics","level":2,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/sped67700.2025.11252230","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11252230","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W2989571531","https://openalex.org/W2989851933","https://openalex.org/W3126967250","https://openalex.org/W4285149944","https://openalex.org/W4312743281","https://openalex.org/W4372346093","https://openalex.org/W4392904433","https://openalex.org/W4392943326","https://openalex.org/W4394852487","https://openalex.org/W4396613654","https://openalex.org/W4398186462","https://openalex.org/W4401114208","https://openalex.org/W4406821510"],"related_works":[],"abstract_inverted_index":{"Speech":[0],"synthesis":[1],"models":[2],"pose":[3],"significant":[4],"risks,":[5],"as":[6,77,79],"modern":[7,109],"text-to-speech":[8],"(TTS)":[9],"and":[10,54,65,84,86,106,119,136],"voice":[11],"conversion":[12],"(VC)":[13],"systems":[14],"can":[15,22],"generate":[16],"highly":[17,38],"realistic":[18],"synthetic":[19],"voices":[20],"that":[21,138],"be":[23],"exploited":[24],"for":[25],"fraud,":[26],"impersonation,":[27],"or":[28],"disinformation.":[29],"In":[30],"this":[31],"paper,":[32],"we":[33],"focus":[34],"on":[35,141],"a":[36,43,56,80,95,142],"single,":[37],"effective":[39],"transfer":[40],"learning":[41,87],"solution:":[42],"pretrained":[44,144],"EfficientNetV2":[45],"backbone":[46],"model":[47],"using":[48],"augmented":[49],"spectrogram":[50,59],"inputs.":[51],"We":[52],"employed":[53],"evaluated":[55],"suite":[57],"of":[58],"augmentation":[60,140],"techniques":[61],"(adaptive":[62],"resizing,":[63],"SpecAugment,":[64],"Mixup),":[66],"regularization":[67],"methods":[68],"(dropout,":[69],"stochastic":[70],"depth,":[71],"label":[72],"smoothing,":[73],"selective":[74],"layer":[75],"freezing),":[76],"well":[78],"progressive":[81],"resizing":[82],"approach":[83],"optimization":[85],"rate":[88],"strategies.":[89],"For":[90],"the":[91,99,133],"Fake-or-Real":[92],"(FoR)":[93],"dataset,":[94],"reference":[96],"benchmark":[97],"in":[98],"field,":[100],"comprising":[101],"speech":[102],"samples":[103],"both":[104],"genuine":[105],"generated":[107],"by":[108,127],"TTS/VC":[110],"models,":[111],"our":[112],"system":[113],"achieves":[114],"$97.2":[115],"\\%$":[116,121],"validation":[117],"accuracy":[118],"$91.1":[120],"test":[122],"accuracy,":[123],"surpassing":[124],"prior":[125],"performance":[126,148],"up":[128],"to":[129],"$19.6":[130],"\\%$,":[131],"reducing":[132],"generalization":[134],"gap,":[135],"indicating":[137],"focused":[139],"single":[143],"network":[145],"yields":[146],"state-of-the-art":[147],"with":[149],"minimal":[150],"complexity":[151],"increase.":[152]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-27T00:00:00"}
