{"id":"https://openalex.org/W7156661830","doi":"https://doi.org/10.48550/arxiv.2604.23742","title":"RTCFake: Speech Deepfake Detection in Real-Time Communication","display_name":"RTCFake: Speech Deepfake Detection in Real-Time Communication","publication_year":2026,"publication_date":"2026-04-26","ids":{"openalex":"https://openalex.org/W7156661830","doi":"https://doi.org/10.48550/arxiv.2604.23742"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.23742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.23742","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134770099","display_name":"Jun Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xue, Jun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124403526","display_name":"Zhuolin Yi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi, Zhuolin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134768728","display_name":"Yihuan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yihuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134803573","display_name":"Yanzhen Ren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Yanzhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134752786","display_name":"Yujie Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yujie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037493212","display_name":"Cunhang Fan","orcid":"https://orcid.org/0000-0001-6318-8803"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Cunhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134777906","display_name":"Zicheng Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Zicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134780957","display_name":"Yonghong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yonghong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134762125","display_name":"Bo Cai","orcid":"https://orcid.org/0009-0009-0196-5756"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5134770099"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.6040999889373779,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.6040999889373779,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.10010000318288803,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.05209999904036522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5493999719619751},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5321999788284302},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5217999815940857},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.49729999899864197},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.47999998927116394},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.44029998779296875},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.43059998750686646},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.40130001306533813}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8051999807357788},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5493999719619751},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5321999788284302},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5217999815940857},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.49729999899864197},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.47999998927116394},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4668000042438507},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45669999718666077},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4535999894142151},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.44029998779296875},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.43059998750686646},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.40130001306533813},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3896999955177307},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3314000070095062},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.3285999894142151},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27410000562667847},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2727000117301941},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.25699999928474426},{"id":"https://openalex.org/C84525736","wikidata":"https://www.wikidata.org/wiki/Q831366","display_name":"Decision tree","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.23742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.23742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.4502536952495575}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"With":[0],"the":[1,8,35,59,122,161,188],"rapid":[2],"advancement":[3],"of":[4],"speech":[5,12,44,62,81,157],"generation":[6],"technologies,":[7],"threat":[9],"posed":[10],"by":[11,79],"deepfakes":[13],"in":[14,168,187],"real-time":[15],"communication":[16],"(RTC)":[17],"scenarios":[18],"has":[19],"intensified.":[20],"However,":[21],"existing":[22],"detection":[23],"studies":[24],"mainly":[25],"focus":[26],"on":[27],"offline":[28,96],"simulations":[29],"and":[30,50,87,97,130,142,152,172,178],"struggle":[31],"to":[32,113],"cope":[33],"with":[34],"complex":[36,144],"distortions":[37],"introduced":[38],"during":[39],"RTC":[40,67,140],"transmission,":[41],"including":[42],"unknown":[43],"enhancement":[45],"processes":[46],"(e.g.,":[47,90],"noise":[48,145,173],"suppression)":[49],"codec":[51],"compression.":[52],"To":[53],"address":[54],"this":[55,120],"challenge,":[56],"we":[57,102],"present":[58],"first":[60],"large-scale":[61],"deepfake":[63,158],"dataset":[64,76,124,184],"tailored":[65],"for":[66,156],"scenarios,":[68],"termed":[69],"\\textit{RTCFake},":[70],"totaling":[71],"approximately":[72],"600":[73],"hours.":[74],"The":[75,133,182],"is":[77,125,185],"constructed":[78],"transmitting":[80],"through":[82],"multiple":[83],"mainstream":[84],"social":[85],"media":[86],"conferencing":[88],"platforms":[89,141],"Zoom),":[91],"enabling":[92],"precise":[93],"pairing":[94],"between":[95],"online":[98],"speech.":[99],"In":[100,119],"addition,":[101],"propose":[103],"a":[104,149],"phoneme-guided":[105],"consistency":[106],"learning":[107],"(PCL)":[108],"strategy":[109,164],"that":[110],"enforces":[111],"models":[112],"learn":[114],"platform-invariant":[115],"semantic":[116],"structural":[117],"representations.":[118],"paper,":[121],"RTCFake":[123],"divided":[126],"into":[127],"training,":[128],"development,":[129],"evaluation":[131,134,154],"sets.":[132],"set":[135],"further":[136],"includes":[137],"both":[138,169],"unseen":[139,143],"conditions,":[146],"thereby":[147],"providing":[148],"more":[150],"realistic":[151],"challenging":[153],"benchmark":[155],"detection.":[159],"Furthermore,":[160],"proposed":[162],"PCL":[163],"achieves":[165],"significant":[166],"improvements":[167],"cross-platform":[170],"generalization":[171],"robustness,":[174],"offering":[175],"an":[176],"effective":[177],"generalizable":[179],"modeling":[180],"paradigm.":[181],"\\textit{RTCFake}":[183],"provided":[186],"{https://huggingface.co/datasets/JunXueTech/RTCFake}.":[189]},"counts_by_year":[],"updated_date":"2026-04-29T06:16:36.941037","created_date":"2026-04-29T00:00:00"}
