{"id":"https://openalex.org/W4413343955","doi":"https://doi.org/10.1109/lsp.2025.3600376","title":"StreamMel: Real-Time Zero-Shot Text-to-Speech Via Interleaved Continuous Autoregressive Modeling","display_name":"StreamMel: Real-Time Zero-Shot Text-to-Speech Via Interleaved Continuous Autoregressive Modeling","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4413343955","doi":"https://doi.org/10.1109/lsp.2025.3600376"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2025.3600376","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3600376","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100460850","display_name":"Hui Wang","orcid":"https://orcid.org/0000-0002-8814-1988"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hui Wang","raw_affiliation_strings":["College of Computer Science, Nankai University, Tianjin, China","College of Computer Science, Nankai University, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]},{"raw_affiliation_string":"College of Computer Science, Nankai University, China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100743975","display_name":"Y. Yang","orcid":"https://orcid.org/0000-0002-8917-2620"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifan Yang","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA","Microsoft Corporation, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corporation, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034770439","display_name":"Shujie Liu","orcid":"https://orcid.org/0000-0003-0012-5199"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA","Microsoft Corporation, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corporation, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365056","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-5206-8600"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA","Microsoft Corporation, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corporation, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089153961","display_name":"Lingwei Meng","orcid":"https://orcid.org/0000-0003-1028-6017"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lingwei Meng","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA","Microsoft Corporation, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corporation, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100360923","display_name":"Yanqing Liu","orcid":"https://orcid.org/0000-0002-4150-0680"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yanqing Liu","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA","Microsoft Corporation, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corporation, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086127472","display_name":"Jiaming Zhou","orcid":"https://orcid.org/0000-0002-9298-4347"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaming Zhou","raw_affiliation_strings":["College of Computer Science, Nankai University, Tianjin, China","College of Computer Science, Nankai University, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]},{"raw_affiliation_string":"College of Computer Science, Nankai University, China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035524175","display_name":"Haoqin Sun","orcid":"https://orcid.org/0000-0002-8554-8969"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoqin Sun","raw_affiliation_strings":["College of Computer Science, Nankai University, Tianjin, China","College of Computer Science, Nankai University, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]},{"raw_affiliation_string":"College of Computer Science, Nankai University, China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100756584","display_name":"Yan Lu","orcid":"https://orcid.org/0000-0001-5383-6424"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yan Lu","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA","Microsoft Corporation, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"Microsoft Corporation, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102006082","display_name":"Yong Qin","orcid":"https://orcid.org/0000-0001-9808-0456"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yong Qin","raw_affiliation_strings":["College of Computer Science, Nankai University, Tianjin, China","College of Computer Science, Nankai University, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]},{"raw_affiliation_string":"College of Computer Science, Nankai University, China","institution_ids":["https://openalex.org/I205237279"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5100460850"],"corresponding_institution_ids":["https://openalex.org/I205237279"],"apc_list":null,"apc_paid":null,"fwci":2.8414,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.9242019,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"32","issue":null,"first_page":"3530","last_page":"3534"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.7550339102745056},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.685882031917572},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6387875080108643},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.560889482498169},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5376874208450317},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4444430470466614},{"id":"https://openalex.org/keywords/star-model","display_name":"STAR model","score":0.4286656975746155},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3518489599227905},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3230511248111725},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1761619746685028},{"id":"https://openalex.org/keywords/time-series","display_name":"Time series","score":0.16939809918403625},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.14372310042381287},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.1092299222946167},{"id":"https://openalex.org/keywords/autoregressive-integrated-moving-average","display_name":"Autoregressive integrated moving average","score":0.07045692205429077}],"concepts":[{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.7550339102745056},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.685882031917572},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6387875080108643},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.560889482498169},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5376874208450317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4444430470466614},{"id":"https://openalex.org/C194657046","wikidata":"https://www.wikidata.org/wiki/Q7394685","display_name":"STAR model","level":4,"score":0.4286656975746155},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3518489599227905},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3230511248111725},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1761619746685028},{"id":"https://openalex.org/C151406439","wikidata":"https://www.wikidata.org/wiki/Q186588","display_name":"Time series","level":2,"score":0.16939809918403625},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.14372310042381287},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.1092299222946167},{"id":"https://openalex.org/C24338571","wikidata":"https://www.wikidata.org/wiki/Q2566298","display_name":"Autoregressive integrated moving average","level":3,"score":0.07045692205429077},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2025.3600376","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3600376","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.5199999809265137}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2963609956","https://openalex.org/W3097777922","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4381786045","https://openalex.org/W4385823334","https://openalex.org/W4392903704","https://openalex.org/W4392904154","https://openalex.org/W4406417959","https://openalex.org/W4406461681","https://openalex.org/W4408592567","https://openalex.org/W4412886819","https://openalex.org/W4412945617"],"related_works":["https://openalex.org/W2439807930","https://openalex.org/W2009692134","https://openalex.org/W1972271943","https://openalex.org/W2019155478","https://openalex.org/W2024529895","https://openalex.org/W2168175994","https://openalex.org/W1902630399","https://openalex.org/W2120434453","https://openalex.org/W3120578569","https://openalex.org/W1487412319"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,95],"zero-shot":[3],"text-to-speech":[4],"(TTS)":[5],"synthesis":[6,76],"have":[7],"achieved":[8],"high-quality":[9],"speech":[10,120],"generation":[11],"for":[12,20,116],"unseen":[13],"speakers,":[14],"but":[15],"most":[16],"systems":[17,107],"remain":[18],"unsuitable":[19],"real-time":[21,111,119],"applications":[22],"because":[23],"of":[24],"their":[25],"offline":[26,106],"design.":[27],"Current":[28],"streaming":[29,58,92],"TTS":[30,59,93],"paradigms":[31],"often":[32],"rely":[33],"on":[34,85],"multi-stage":[35],"pipelines":[36],"and":[37,45,82,98],"discrete":[38],"representations,":[39],"leading":[40],"to":[41,105],"increased":[42],"computational":[43],"cost":[44],"suboptimal":[46],"system":[47],"performance.":[48],"In":[49],"this":[50],"work,":[51],"we":[52],"propose":[53],"StreamMel,":[54],"a":[55],"pioneering":[56],"single-stage":[57],"framework":[60],"that":[61,88],"models":[62],"continuous":[63],"mel-spectrograms.":[64],"By":[65],"interleaving":[66],"text":[67],"tokens":[68],"with":[69,118],"acoustic":[70],"frames,":[71],"StreamMel":[72,89],"enables":[73],"low-latency,":[74],"autoregressive":[75],"while":[77,108],"preserving":[78],"high":[79],"speaker":[80],"similarity":[81],"naturalness.":[83],"Experiments":[84],"LibriSpeech":[86],"demonstrate":[87],"outperforms":[90],"existing":[91],"baselines":[94],"both":[96],"quality":[97],"latency.":[99],"It":[100],"even":[101],"achieves":[102],"performance":[103],"comparable":[104],"supporting":[109],"efficient":[110],"generation,":[112],"showcasing":[113],"broad":[114],"prospects":[115],"integration":[117],"large":[121],"language":[122],"models.":[123],"Audio":[124],"samples":[125],"are":[126],"available":[127],"at:":[128],"<uri":[129],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[130],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://aka.ms/StreamMel</uri>.":[131]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
