{"id":"https://openalex.org/W7154460998","doi":"https://doi.org/10.48550/arxiv.2604.12456","title":"X-VC: Zero-shot Streaming Voice Conversion in Codec Space","display_name":"X-VC: Zero-shot Streaming Voice Conversion in Codec Space","publication_year":2026,"publication_date":"2026-04-14","ids":{"openalex":"https://openalex.org/W7154460998","doi":"https://doi.org/10.48550/arxiv.2604.12456"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.12456","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12456","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.12456","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129732355","display_name":"Qixi Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Qixi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129683563","display_name":"Yuxiang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yuxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133639735","display_name":"Tianrui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Tianrui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133706189","display_name":"Wenxi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Wenxi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133649888","display_name":"Kele Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Kele","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133687735","display_name":"Yikang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yikang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101626752","display_name":"Qinyuan Chen","orcid":"https://orcid.org/0009-0009-6211-2044"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qinyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133644635","display_name":"Xipeng Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Xipeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133669287","display_name":"Kai Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133718350","display_name":"Xie Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9315999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9315999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.021400000900030136,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.011300000362098217,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.651199996471405},{"id":"https://openalex.org/keywords/smoothing","display_name":"Smoothing","score":0.5435000061988831},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.453900009393692},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.4388999938964844},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.42579999566078186},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.3725000023841858},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.3691999912261963},{"id":"https://openalex.org/keywords/acoustic-space","display_name":"Acoustic space","score":0.3409000039100647}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8098000288009644},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6557000279426575},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.651199996471405},{"id":"https://openalex.org/C3770464","wikidata":"https://www.wikidata.org/wiki/Q775963","display_name":"Smoothing","level":2,"score":0.5435000061988831},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.453900009393692},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.4388999938964844},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3725000023841858},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3691999912261963},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3458999991416931},{"id":"https://openalex.org/C108250783","wikidata":"https://www.wikidata.org/wiki/Q4674710","display_name":"Acoustic space","level":3,"score":0.3409000039100647},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.3346000015735626},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.33329999446868896},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3264000117778778},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.3172999918460846},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.26899999380111694},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C2781039887","wikidata":"https://www.wikidata.org/wiki/Q1391724","display_name":"Factor (programming language)","level":2,"score":0.25699999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.12456","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12456","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.12456","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12456","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.47693610191345215,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Zero-shot":[0],"voice":[1,12],"conversion":[2,28,66,197],"(VC)":[3],"aims":[4],"to":[5,49],"convert":[6],"a":[7,58,72,78,123,139,199],"source":[8,85],"utterance":[9],"into":[10],"the":[11,68,108,116,150,155,164,188],"of":[13,71,154],"an":[14],"unseen":[15],"target":[16,94,100],"speaker":[17,41,101,174],"while":[18,97],"preserving":[19],"its":[20],"linguistic":[21],"content.":[22],"Although":[23],"recent":[24],"systems":[25,33],"have":[26],"improved":[27],"quality,":[29],"building":[30,203],"zero-shot":[31,59,206],"VC":[32,61,207],"for":[34,202],"interactive":[35],"scenarios":[36],"remains":[37],"challenging":[38],"because":[39],"high-fidelity":[40],"transfer":[42],"and":[43,88,112,122,130,171,178,181,213],"low-latency":[44,205],"streaming":[45,60,134,166],"inference":[46,141],"are":[47,215],"difficult":[48],"achieve":[50],"simultaneously.":[51],"In":[52],"this":[53],"work,":[54],"we":[55,114,136],"present":[56],"X-VC,":[57],"system":[62],"that":[63,82,126,146,161,194],"performs":[64],"one-step":[65,196],"in":[67,168,176],"latent":[69],"space":[70],"pretrained":[73],"neural":[74],"codec.":[75,156],"X-VC":[76,162],"uses":[77],"dual-conditioning":[79],"acoustic":[80,90],"converter":[81],"jointly":[83],"models":[84],"codec":[86],"latents":[87],"frame-level":[89],"conditions":[91],"derived":[92],"from":[93],"reference":[95],"speech,":[96],"injecting":[98],"utterance-level":[99],"information":[102],"through":[103],"adaptive":[104],"normalization.":[105],"To":[106],"reduce":[107],"mismatch":[109],"between":[110],"training":[111,152],"inference,":[113,135],"train":[115],"model":[117],"with":[118,143,149],"generated":[119],"paired":[120],"data":[121],"role-assignment":[124],"strategy":[125],"combines":[127],"standard,":[128],"reconstruction,":[129],"reversed":[131],"modes.":[132],"For":[133],"further":[137],"adopt":[138],"chunkwise":[140],"scheme":[142],"overlap":[144],"smoothing":[145],"is":[147,198],"aligned":[148],"segment-based":[151],"paradigm":[153],"Experiments":[157],"on":[158],"Seed-TTS-Eval":[159],"show":[160],"achieves":[163],"best":[165],"WER":[167],"both":[169],"English":[170],"Chinese,":[172],"strong":[173],"similarity":[175],"same-language":[177],"cross-lingual":[179],"settings,":[180],"substantially":[182],"lower":[183],"offline":[184],"real-time":[185],"factor":[186],"than":[187],"compared":[189],"baselines.":[190],"These":[191],"results":[192],"suggest":[193],"codec-space":[195],"practical":[200],"approach":[201],"high-quality":[204],"systems.":[208],"Our":[209],"audio":[210],"samples,":[211],"code":[212],"checkpoints":[214],"released":[216],"at":[217],"https://github.com/Jerrister/X-VC.":[218]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-16T00:00:00"}
