{"id":"https://openalex.org/W7148483669","doi":"https://doi.org/10.48550/arxiv.2604.00688","title":"OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models","display_name":"OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7148483669","doi":"https://doi.org/10.48550/arxiv.2604.00688"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00688","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00688","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00688","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132799531","display_name":"Han Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132820688","display_name":"Lingxuan Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Lingxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132809669","display_name":"Wei Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132789676","display_name":"Zengwei Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Zengwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132792853","display_name":"Liyong Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Liyong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132821560","display_name":"Fangjun Kuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kuang, Fangjun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132793434","display_name":"Zhifeng Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Zhifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113987256","display_name":"Weiji Zhuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Weiji","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132808478","display_name":"Long Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Long","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5084286453","display_name":"Daniel Povey","orcid":"https://orcid.org/0000-0002-0611-3634"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Povey, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.7121000289916992,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.7121000289916992,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.039400000125169754,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.023099999874830246,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6690999865531921},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.6226000189781189},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.6154999732971191},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5741000175476074},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5485000014305115},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.4968999922275543},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.3756999969482422},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.31200000643730164}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7861999869346619},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6690999865531921},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.6226000189781189},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.6154999732971191},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5741000175476074},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5485000014305115},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.4968999922275543},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.38519999384880066},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.3756999969482422},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35740000009536743},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.35519999265670776},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.31200000643730164},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C2780035574","wikidata":"https://www.wikidata.org/wiki/Q30081","display_name":"Multilingualism","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.2824999988079071},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C39608478","wikidata":"https://www.wikidata.org/wiki/Q5015979","display_name":"Cache language model","level":5,"score":0.2678000032901764},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26269999146461487},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C101722063","wikidata":"https://www.wikidata.org/wiki/Q218825","display_name":"Random access","level":2,"score":0.2529999911785126},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00688","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00688","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00688","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00688","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7079331278800964}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"OmniVoice,":[2],"a":[3,20,63,75,84],"massively":[4],"multilingual":[5,86,110],"zero-shot":[6],"text-to-speech":[7],"(TTS)":[8],"model":[9],"that":[10,34],"scales":[11],"to":[12,48,78,99],"over":[13],"600":[14],"languages.":[15],"At":[16],"its":[17],"core":[18],"is":[19,55],"novel":[21],"diffusion":[22],"language":[23,97],"model-style":[24],"discrete":[25,31],"non-autoregressive":[26],"(NAR)":[27],"architecture.":[28],"Unlike":[29],"conventional":[30],"NAR":[32],"models":[33,116],"suffer":[35],"from":[36,74,90],"performance":[37,104],"bottlenecks":[38],"in":[39],"complex":[40],"two-stage":[41],"(text-to-semantic-to-acoustic)":[42],"pipelines,":[43],"OmniVoice":[44,93],"directly":[45],"maps":[46],"text":[47],"multi-codebook":[49],"acoustic":[50],"tokens.":[51],"This":[52],"simplified":[53],"approach":[54],"facilitated":[56],"by":[57],"two":[58],"key":[59],"technical":[60],"innovations:":[61],"(1)":[62],"full-codebook":[64],"random":[65],"masking":[66],"strategy":[67],"for":[68],"efficient":[69],"training,":[70],"and":[71,101,108,114],"(2)":[72],"initialization":[73],"pre-trained":[76,115],"LLM":[77],"ensure":[79],"superior":[80],"intelligibility.":[81],"By":[82],"leveraging":[83],"581k-hour":[85],"dataset":[87],"curated":[88],"entirely":[89],"open-source":[91],"data,":[92],"achieves":[94],"the":[95],"broadest":[96],"coverage":[98],"date":[100],"delivers":[102],"state-of-the-art":[103],"across":[105],"Chinese,":[106],"English,":[107],"diverse":[109],"benchmarks.":[111],"Our":[112],"code":[113],"are":[117],"publicly":[118],"available":[119],"at":[120],"https://github.com/k2-fsa/OmniVoice.":[121]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-03T00:00:00"}
