{"id":"https://openalex.org/W4415540898","doi":"https://doi.org/10.1145/3746027.3754745","title":"Pseudo-Autoregressive Neural Codec Language Models for Efficient Zero-Shot Text-to-Speech Synthesis","display_name":"Pseudo-Autoregressive Neural Codec Language Models for Efficient Zero-Shot Text-to-Speech Synthesis","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540898","doi":"https://doi.org/10.1145/3746027.3754745"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754745","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754745","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5115596565","display_name":"Yifan Yang","orcid":"https://orcid.org/0009-0003-0588-1812"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifan Yang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0003-0588-1812","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-2599-6752"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft Corporation, Hong Kong, Hong Kong"],"raw_orcid":"https://orcid.org/0009-0008-2599-6752","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365053","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-1089-9748"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0002-1089-9748","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuxuan Hu","orcid":"https://orcid.org/0009-0004-5688-7488"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yuxuan Hu","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0009-0004-5688-7488","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101474128","display_name":"Haibin Wu","orcid":"https://orcid.org/0000-0001-7166-5534"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haibin Wu","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0001-7166-5534","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115597140","display_name":"Hui Wang","orcid":"https://orcid.org/0009-0003-8057-4644"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hui Wang","raw_affiliation_strings":["Microsoft Corporation, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-8057-4644","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004643540","display_name":"Jianwei Yu","orcid":"https://orcid.org/0000-0002-2449-1436"},"institutions":[{"id":"https://openalex.org/I4210153468","display_name":"Microsoft (Canada)","ror":"https://ror.org/04xhxg104","country_code":"CA","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210153468"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Jianwei Yu","raw_affiliation_strings":["Microsoft Corporation, Vancouver, BC, Canada"],"raw_orcid":"https://orcid.org/0000-0002-2449-1436","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Vancouver, BC, Canada","institution_ids":["https://openalex.org/I4210153468"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089153961","display_name":"Lingwei Meng","orcid":"https://orcid.org/0000-0003-1028-6017"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingwei Meng","raw_affiliation_strings":["Microsoft Corporation, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-1028-6017","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066697166","display_name":"Haiyang Sun","orcid":"https://orcid.org/0009-0004-3485-3869"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haiyang Sun","raw_affiliation_strings":["Microsoft Corporation, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-3485-3869","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yanqing Liu","orcid":"https://orcid.org/0000-0002-4150-0680"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanqing Liu","raw_affiliation_strings":["Microsoft Corporation, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-4150-0680","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100756584","display_name":"Yan Lu","orcid":"https://orcid.org/0000-0001-5383-6424"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Lu","raw_affiliation_strings":["Microsoft Corporation, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5383-6424","affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043098653","display_name":"Kai Yu","orcid":"https://orcid.org/0000-0002-7102-9826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-7102-9826","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101293966","display_name":"Xie Chen","orcid":"https://orcid.org/0000-0001-7423-617X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xie Chen","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai Innovation Institute, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-7423-617X","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai Innovation Institute, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14518133,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9316","last_page":"9325"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9898999929428101,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.6338000297546387},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6226999759674072},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5605999827384949},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5424000024795532},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.4821999967098236},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.41839998960494995},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.392300009727478},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.3686999976634979}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8240000009536743},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.6338000297546387},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6291999816894531},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6226999759674072},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5605999827384949},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5424000024795532},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.4821999967098236},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42829999327659607},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.41839998960494995},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.392300009727478},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.3686999976634979},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.34040001034736633},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3296999931335449},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.26919999718666077},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.2590999901294708},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2574999928474426},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.2567000091075897},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25519999861717224}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754745","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754745","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G6447785163","display_name":null,"funder_award_id":"No. 62206171 and No. U23B2018","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":4,"referenced_works":["https://openalex.org/W3215615641","https://openalex.org/W4390075359","https://openalex.org/W4406417959","https://openalex.org/W4413343955"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"zero-shot":[1],"text-to-speech":[2],"(TTS)":[3],"systems":[4,142],"face":[5],"a":[6,37,76],"common":[7],"dilemma:":[8],"autoregressive":[9],"(AR)":[10],"models":[11,23],"suffer":[12],"from":[13,55,60],"slow":[14],"generation":[15,59,85],"and":[16,27,48,150,164],"lack":[17,24],"duration":[18],"controllability,":[19],"while":[20,166],"non-autoregressive":[21],"(NAR)":[22],"temporal":[25,53],"modeling":[26,43,54],"typically":[28],"require":[29],"complex":[30],"designs.":[31],"In":[32,90,117],"this":[33],"paper,":[34],"we":[35,73],"introduce":[36],"novel":[38],"pseudo-autoregressive":[39],"(PAR)":[40],"codec":[41],"language":[42],"approach":[44],"that":[45,80,135],"unifies":[46],"AR":[47,56],"NAR":[49,88],"modeling.":[50],"Combining":[51],"explicit":[52],"with":[57,103],"parallel":[58,110],"NAR,":[61],"PAR":[62,82,94],"generates":[63,96],"dynamic-length":[64],"spans":[65],"at":[66,179],"fixed":[67],"time":[68,101],"steps.":[69],"Building":[70],"on":[71,138,144,152],"PAR,":[72],"propose":[74],"PALLE,":[75,136],"two-stage":[77],"TTS":[78],"system":[79],"leverages":[81],"for":[83],"initial":[84],"followed":[86],"by":[87],"refinement.":[89],"the":[91,100,114,118,129,153],"first":[92],"stage,":[93,120],"progressively":[95],"speech":[97,160],"tokens":[98,122],"along":[99],"dimension,":[102],"each":[104],"step":[105],"predicting":[106],"all":[107],"positions":[108],"in":[109,126,157],"but":[111],"only":[112],"retaining":[113],"left-most":[115],"span.":[116],"second":[119],"low-confidence":[121],"are":[123,177],"iteratively":[124],"refined":[125],"parallel,":[127],"leveraging":[128],"global":[130],"contextual":[131],"information.":[132],"Experiments":[133],"demonstrate":[134],"trained":[137,143],"LibriTTS,":[139],"outperforms":[140],"state-of-the-art":[141],"large-scale":[145],"data,":[146],"including":[147],"F5-TTS,":[148],"E2-TTS,":[149],"MaskGCT,":[151],"LibriSpeech":[154],"test-clean":[155],"set":[156],"terms":[158],"of":[159],"quality,":[161],"speaker":[162],"similarity,":[163],"intelligibility,":[165],"achieving":[167],"up":[168],"to":[169],"ten":[170],"times":[171],"faster":[172],"inference":[173],"speed.":[174],"Audio":[175],"samples":[176],"available":[178],"https://microsoft.com/research/project/vall-e-x/palle.":[180]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-25T00:00:00"}
