{"id":"https://openalex.org/W7148430036","doi":"https://doi.org/10.1109/asru65441.2025.11433841","title":"ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching","display_name":"ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148430036","doi":"https://doi.org/10.1109/asru65441.2025.11433841"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11433841","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11433841","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121617127","display_name":"Han Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Han Zhu","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132802820","display_name":"Wei Kang","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Kang","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132822827","display_name":"Zengwei Yao","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zengwei Yao","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132803267","display_name":"Liyong Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liyong Guo","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132801235","display_name":"Fangjun Kuang","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fangjun Kuang","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121682776","display_name":"Zhaoqing Li","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaoqing Li","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113987256","display_name":"Weiji Zhuang","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiji Zhuang","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132821138","display_name":"Long Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Lin","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132793753","display_name":"Daniel Povey","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Daniel Povey","raw_affiliation_strings":["Xiaomi Corp,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corp,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5121617127"],"corresponding_institution_ids":["https://openalex.org/I862669128"],"apc_list":null,"apc_paid":null,"fwci":4.3637,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.95468257,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.941100001335144,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.941100001335144,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.007699999958276749,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.007600000128149986,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.6879000067710876},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6474999785423279},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5450999736785889},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5309000015258789},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4912000000476837},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.46639999747276306},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.43880000710487366},{"id":"https://openalex.org/keywords/flow","display_name":"Flow (mathematics)","score":0.42080000042915344}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.713699996471405},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.6879000067710876},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6474999785423279},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5450999736785889},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5309000015258789},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4912000000476837},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.46639999747276306},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.43880000710487366},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.42080000042915344},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.4153999984264374},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.38449999690055847},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3646000027656555},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.35370001196861267},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3334999978542328},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.3271999955177307},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.3116999864578247},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.30059999227523804},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27469998598098755},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C167928553","wikidata":"https://www.wikidata.org/wiki/Q1376021","display_name":"Estimation theory","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2554999887943268},{"id":"https://openalex.org/C134261354","wikidata":"https://www.wikidata.org/wiki/Q938438","display_name":"Statistical inference","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11433841","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11433841","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W3015282541","https://openalex.org/W3024869864","https://openalex.org/W3097777922","https://openalex.org/W3160919572","https://openalex.org/W3163132306","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4225956675","https://openalex.org/W4252812408","https://openalex.org/W4283067311","https://openalex.org/W4385245566","https://openalex.org/W4386071831","https://openalex.org/W4386076493","https://openalex.org/W4387968164","https://openalex.org/W4391021797","https://openalex.org/W4392903704","https://openalex.org/W4392904491","https://openalex.org/W4392931276","https://openalex.org/W4392931781","https://openalex.org/W4402111456","https://openalex.org/W4402715897","https://openalex.org/W4403791830","https://openalex.org/W4406417959","https://openalex.org/W4406461271","https://openalex.org/W4406461672","https://openalex.org/W4406461681","https://openalex.org/W4408347240","https://openalex.org/W4409090500","https://openalex.org/W4412945617","https://openalex.org/W7133245521"],"related_works":[],"abstract_inverted_index":{"Existing":[0],"large-scale":[1],"zero-shot":[2,31],"text-to-speech":[3],"(TTS)":[4],"models":[5,104],"deliver":[6],"high":[7],"speech":[8,72,106],"quality":[9],"but":[10],"suffer":[11],"from":[12],"slow":[13],"inference":[14,41,86],"speeds":[15],"due":[16],"to":[17,52,70,79,115],"massive":[18],"parameters.":[19],"To":[20],"address":[21],"this":[22,24],"issue,":[23],"paper":[25],"introduces":[26],"ZipVoice,":[27],"a":[28,35,47,120],"high-quality":[29],"flow-matching-based":[30],"TTS":[32],"model":[33,37,125],"with":[34,89],"compact":[36],"size":[38],"and":[39,66,83,113,127],"fast":[40],"speed.":[42],"Key":[43],"designs":[44],"include:":[45],"1)":[46],"Zipformer-based":[48,67],"vector":[49],"field":[50],"estimator":[51],"maintain":[53],"adequate":[54],"modeling":[55],"capabilities":[56],"under":[57],"constrained":[58],"size;":[59],"2)":[60],"Average":[61],"upsampling-based":[62],"initial":[63],"speech-text":[64],"alignment":[65],"text":[68],"encoder":[69],"improve":[71],"intelligibility;":[73],"3)":[74],"A":[75],"flow":[76],"distillation":[77],"method":[78],"reduce":[80],"sampling":[81],"steps":[82],"eliminate":[84],"the":[85],"overhead":[87],"associated":[88],"classifier-free":[90],"guidance.":[91],"Experiments":[92],"on":[93],"100":[94],"k":[95],"hours":[96],"multilingual":[97],"datasets":[98],"show":[99],"that":[100],"ZipVoice":[101],"matches":[102],"state-of-the-art":[103],"in":[105],"quality,":[107],"while":[108],"being":[109],"3":[110],"times":[111,117],"smaller":[112],"up":[114],"30":[116],"faster":[118],"than":[119],"DiT-based":[121],"flow-matching":[122],"baseline.":[123],"Codes,":[124],"checkpoints":[126],"demo":[128],"samples":[129],"are":[130],"publicly":[131],"available.<sup":[132],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[133,135],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup><sup":[134],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>https://github.com/k2-fsa/ZipVoice":[136]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2026-04-03T00:00:00"}
