{"id":"https://openalex.org/W7152657870","doi":"https://doi.org/10.48550/arxiv.2604.06327","title":"A Novel Automatic Framework for Speaker Drift Detection in Synthesized Speech","display_name":"A Novel Automatic Framework for Speaker Drift Detection in Synthesized Speech","publication_year":2026,"publication_date":"2026-04-07","ids":{"openalex":"https://openalex.org/W7152657870","doi":"https://doi.org/10.48550/arxiv.2604.06327"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06327","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06327","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06327","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133242608","display_name":"Jia-Hong Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Huang, Jia-Hong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133313138","display_name":"Seulgi Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Seulgi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133312237","display_name":"Yi Chieh Liu","orcid":"https://orcid.org/0000-0002-9675-416X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yi Chieh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021445324","display_name":"Yixian Shen","orcid":"https://orcid.org/0000-0001-8447-872X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yixian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075073019","display_name":"Hongyi Zhu","orcid":"https://orcid.org/0000-0002-2633-7686"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hongyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133248676","display_name":"Prayag Tiwari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tiwari, Prayag","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075331928","display_name":"Stevan Rudinac","orcid":"https://orcid.org/0000-0003-1904-8736"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rudinac, Stevan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133242129","display_name":"Evangelos Kanoulas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kanoulas, Evangelos","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5133242608"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6765999794006348,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6765999794006348,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.15489999949932098,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.02930000051856041,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.7321000099182129},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5971999764442444},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.573199987411499},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.515999972820282},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5001999735832214},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.48350000381469727},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4823000133037567},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4805999994277954},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.47620001435279846}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.7321000099182129},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7046999931335449},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6261000037193298},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5971999764442444},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.573199987411499},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.515999972820282},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5001999735832214},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.48350000381469727},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4823000133037567},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4805999994277954},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.47620001435279846},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.4562999904155731},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4361000061035156},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4316999912261963},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.42250001430511475},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4219000041484833},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.3790999948978424},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3749000132083893},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3582000136375427},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.34880000352859497},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34700000286102295},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C66905080","wikidata":"https://www.wikidata.org/wiki/Q17005494","display_name":"Binary classification","level":3,"score":0.2838999927043915},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06327","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06327","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06327","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06327","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"diffusion-based":[1],"text-to-speech":[2],"(TTS)":[3],"models":[4,80],"achieve":[5],"high":[6],"naturalness":[7],"and":[8,76,96,145],"expressiveness,":[9],"yet":[10],"often":[11],"suffer":[12],"from":[13],"speaker":[14,22,51,63,99,120,138],"drift,":[15],"a":[16,25,57,114,141],"subtle,":[17],"gradual":[18],"shift":[19],"in":[20,38,154],"perceived":[21],"identity":[23],"within":[24],"single":[26],"utterance.":[27],"This":[28],"underexplored":[29],"phenomenon":[30],"undermines":[31],"the":[32,45,106,129],"coherence":[33],"of":[34,73,131],"synthetic":[35,116],"speech,":[36],"especially":[37],"long-form":[39],"or":[40],"interactive":[41],"settings.":[42],"We":[43,88],"introduce":[44],"first":[46],"automatic":[47],"framework":[48],"for":[49,92],"detecting":[50],"drift":[52,94,121,139],"by":[53],"formulating":[54],"it":[55],"as":[56,140],"binary":[58],"classification":[59],"task":[60],"over":[61],"utterance-level":[62],"consistency.":[64],"Our":[65,135],"method":[66],"computes":[67],"cosine":[68],"similarity":[69],"across":[70],"overlapping":[71],"segments":[72],"synthesized":[74],"speech":[75],"prompts":[77],"large":[78],"language":[79],"(LLMs)":[81],"with":[82,118,124,150],"structured":[83],"representations":[84],"to":[85],"assess":[86],"drift.":[87],"provide":[89],"theoretical":[90],"guarantees":[91],"cosine-based":[93],"detection":[95],"demonstrate":[97],"that":[98],"embeddings":[100],"exhibit":[101],"meaningful":[102],"geometric":[103,147],"clustering":[104],"on":[105],"unit":[107],"sphere.":[108],"To":[109],"support":[110],"evaluation,":[111],"we":[112],"construct":[113],"high-quality":[115],"benchmark":[117],"human-validated":[119],"annotations.":[122],"Experiments":[123],"multiple":[125],"state-of-the-art":[126],"LLMs":[127],"confirm":[128],"viability":[130],"this":[132],"embedding-to-reasoning":[133],"pipeline.":[134],"work":[136],"establishes":[137],"standalone":[142],"research":[143],"problem":[144],"bridges":[146],"signal":[148],"analysis":[149],"LLM-based":[151],"perceptual":[152],"reasoning":[153],"modern":[155],"TTS.":[156]},"counts_by_year":[],"updated_date":"2026-04-10T06:07:51.998497","created_date":"2026-04-10T00:00:00"}
