{"id":"https://openalex.org/W7134285788","doi":"https://doi.org/10.48550/arxiv.2603.06079","title":"StreamVoiceAnon+: Emotion-Preserving Streaming Speaker Anonymization via Frame-Level Acoustic Distillation","display_name":"StreamVoiceAnon+: Emotion-Preserving Streaming Speaker Anonymization via Frame-Level Acoustic Distillation","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7134285788","doi":"https://doi.org/10.48550/arxiv.2603.06079"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.06079","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080580365","display_name":"Nikita Kuzmin","orcid":"https://orcid.org/0000-0002-8260-7118"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kuzmin, Nikita","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128500759","display_name":"Kong Aik Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Kong Aik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128547680","display_name":"Eng Siong Chng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chng, Eng Siong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5080580365"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8040000200271606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8040000200271606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.02850000001490116,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5741000175476074},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.4973999857902527},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4300000071525574},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4050000011920929},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3953999876976013},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3856000006198883},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.38029998540878296},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.36489999294281006},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.3614000082015991},{"id":"https://openalex.org/keywords/upload","display_name":"Upload","score":0.36059999465942383}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8511000275611877},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6736000180244446},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5741000175476074},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.4973999857902527},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4300000071525574},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4050000011920929},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3953999876976013},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3856000006198883},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3846000134944916},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.38029998540878296},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.36489999294281006},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C71901391","wikidata":"https://www.wikidata.org/wiki/Q7126699","display_name":"Upload","level":2,"score":0.36059999465942383},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.35190001130104065},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.34299999475479126},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3345000147819519},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3296999931335449},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3190999925136566},{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.28630000352859497},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.27630001306533813},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C108699837","wikidata":"https://www.wikidata.org/wiki/Q7120750","display_name":"PSQM","level":4,"score":0.26080000400543213},{"id":"https://openalex.org/C88626702","wikidata":"https://www.wikidata.org/wiki/Q1128903","display_name":"Continuation","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C2776848632","wikidata":"https://www.wikidata.org/wiki/Q853463","display_name":"Clipping (morphology)","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.2526000142097473},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.06079","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.06079","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.06079","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.06079","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,45],"address":[1],"the":[2,33,54,96,118,124],"challenge":[3],"of":[4],"preserving":[5,42],"emotional":[6,30],"content":[7,27],"in":[8],"streaming":[9,93],"speaker":[10],"anonymization":[11],"(SA).":[12],"Neural":[13],"audio":[14,20],"codec":[15],"language":[16],"models":[17],"trained":[18],"for":[19],"continuation":[21],"tend":[22],"to":[23,36,71],"degrade":[24],"source":[25],"emotion:":[26],"tokens":[28],"discard":[29],"information,":[31],"and":[32,82,121,136],"model":[34],"defaults":[35],"dominant":[37],"acoustic":[38,63],"patterns":[39],"rather":[40],"than":[41,76],"paralinguistic":[43],"attributes.":[44],"propose":[46],"supervised":[47],"finetuning":[48],"with":[49,58,108],"neutral-emotion":[50],"utterance":[51],"pairs":[52],"from":[53],"same":[55],"speaker,":[56],"combined":[57],"frame-level":[59],"emotion":[60],"distillation":[61],"on":[62,79],"token":[64],"hidden":[65],"states.":[66],"All":[67],"modifications":[68],"are":[69,138],"confined":[70],"finetuning,":[72],"which":[73],"takes":[74],"less":[75],"2":[77],"hours":[78],"4":[80],"GPUs":[81],"adds":[83],"zero":[84],"inference":[85],"latency":[86],"overhead,":[87],"while":[88,129],"maintaining":[89,130],"a":[90,103,112],"competitive":[91],"180ms":[92],"latency.":[94],"On":[95],"VoicePrivacy":[97],"2024":[98],"protocol,":[99],"our":[100],"approach":[101],"achieves":[102],"49.2%":[104],"UAR":[105,115],"(emotion":[106],"preservation)":[107],"5.77%":[109],"WER":[110],"(intelligibility),":[111],"+24%":[113],"relative":[114],"improvement":[116],"over":[117,123],"baseline":[119],"(39.7%-&gt;49.2%)":[120],"+10%":[122],"emotion-prompt":[125],"variant":[126],"(44.6%":[127],"UAR),":[128],"strong":[131],"privacy":[132],"(EER":[133],"49.0%).":[134],"Demo":[135],"code":[137],"available:":[139],"https://anonymous3842031239.github.io/":[140]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-10T00:00:00"}
