{"id":"https://openalex.org/W7125125048","doi":"https://doi.org/10.48550/arxiv.2601.13948","title":"Stream-Voice-Anon: Enhancing Utility of Real-Time Speaker Anonymization via Neural Audio Codec and Language Models","display_name":"Stream-Voice-Anon: Enhancing Utility of Real-Time Speaker Anonymization via Neural Audio Codec and Language Models","publication_year":2026,"publication_date":"2026-01-20","ids":{"openalex":"https://openalex.org/W7125125048","doi":"https://doi.org/10.48550/arxiv.2601.13948"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.13948","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080580365","display_name":"Nikita Kuzmin","orcid":"https://orcid.org/0000-0002-8260-7118"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kuzmin, Nikita","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123479629","display_name":"Songting Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Songting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056758395","display_name":"Kong Aik Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Kong Aik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123513898","display_name":"Eng Siong Chng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chng, Eng Siong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5080580365"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6942999958992004,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6942999958992004,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.05079999938607216,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.05009999871253967,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.5651999711990356},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4722000062465668},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4636000096797943},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4593999981880188},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.4366999864578247},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.3797999918460846},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.3646000027656555},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.3555000126361847}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8269000053405762},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6069999933242798},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.5651999711990356},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4722000062465668},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4636000096797943},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4593999981880188},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.4366999864578247},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3797999918460846},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.3646000027656555},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36160001158714294},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3555000126361847},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.34850001335144043},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.31850001215934753},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30329999327659607},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.2630000114440918},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.2547999918460846},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.13948","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.13948","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13948","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.13948","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.6643191576004028,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Protecting":[0],"speaker":[1,11,27,105,127],"identity":[2],"is":[3],"crucial":[4],"for":[5,50,61,72,90,113],"online":[6,56],"voice":[7,62],"applications,":[8],"yet":[9],"streaming":[10,51,91,176],"anonymization":[12,95,98],"(SA)":[13],"remains":[14],"underexplored.":[15],"Recent":[16],"research":[17],"has":[18],"demonstrated":[19],"that":[20,116],"neural":[21],"audio":[22],"codec":[23],"(NAC)":[24],"provides":[25],"superior":[26],"feature":[28],"disentanglement":[29,119],"and":[30,47,108,134,163,186],"linguistic":[31,45],"fidelity.":[32],"NAC":[33,87],"can":[34],"also":[35],"be":[36],"used":[37],"with":[38],"causal":[39,85],"language":[40],"models":[41],"(LM)":[42],"to":[43,125,138,158,167,172],"enhance":[44],"fidelity":[46],"prompt":[48,110],"control":[49],"tasks.":[52],"However,":[53],"existing":[54],"NAC-based":[55],"LM":[57,114],"systems":[58],"are":[59],"designed":[60],"conversion":[63],"(VC)":[64],"rather":[65],"than":[66],"anonymization,":[67],"lacking":[68],"the":[69,118,146,173],"techniques":[70],"required":[71],"privacy":[73,187],"protection.":[74],"Building":[75],"on":[76],"these":[77],"advances,":[78],"we":[79,131],"present":[80],"Stream-Voice-Anon,":[81],"which":[82],"adapts":[83],"modern":[84],"LM-based":[86],"architectures":[88],"specifically":[89],"SA":[92],"by":[93],"integrating":[94],"techniques.":[96],"Our":[97],"approach":[99],"incorporates":[100],"pseudo-speaker":[101],"representation":[102],"sampling,":[103],"a":[104],"embedding":[106],"mixing":[107],"diverse":[109],"selection":[111],"strategies":[112],"conditioning":[115],"leverage":[117],"properties":[120],"of":[121],"quantized":[122],"content":[123],"codes":[124],"prevent":[126],"information":[128],"leakage.":[129],"Additionally,":[130],"compare":[132],"dynamic":[133],"fixed":[135],"delay":[136],"configurations":[137],"explore":[139],"latency-privacy":[140],"trade-offs":[141],"in":[142,155],"real-time":[143],"scenarios.":[144],"Under":[145],"VoicePrivacy":[147],"2024":[148],"Challenge":[149],"protocol,":[150],"Stream-Voice-Anon":[151],"achieves":[152],"substantial":[153],"improvements":[154],"intelligibility":[156],"(up":[157,166],"46%":[159],"relative":[160,195],"WER":[161],"reduction)":[162],"emotion":[164],"preservation":[165],"28%":[168],"UAR":[169],"relative)":[170],"compared":[171],"previous":[174],"state-of-the-art":[175],"method":[177],"DarkStream":[178],"while":[179],"maintaining":[180],"comparable":[181],"latency":[182],"(180ms":[183],"vs":[184],"200ms)":[185],"protection":[188],"against":[189,197],"lazy-informed":[190],"attackers,":[191],"though":[192],"showing":[193],"15%":[194],"degradation":[196],"semi-informed":[198],"attackers.":[199]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-01-22T00:00:00"}
