{"id":"https://openalex.org/W4415536855","doi":"https://doi.org/10.1145/3746027.3755816","title":"Speech Token Prediction via Compressed-to-fine Language Modeling for Speech Generation","display_name":"Speech Token Prediction via Compressed-to-fine Language Modeling for Speech Generation","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415536855","doi":"https://doi.org/10.1145/3746027.3755816"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755816","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755816","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059916277","display_name":"Wenrui Liu","orcid":"https://orcid.org/0009-0000-5940-5369"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenrui Liu","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-5940-5369","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100428431","display_name":"Qian Chen","orcid":"https://orcid.org/0000-0001-6939-7438"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qian Chen","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6939-7438","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100396375","display_name":"Wen Wang","orcid":"https://orcid.org/0000-0002-0356-1968"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wen Wang","raw_affiliation_strings":["Alibaba Group, Sunnyvale, California, USA"],"raw_orcid":"https://orcid.org/0000-0002-0356-1968","affiliations":[{"raw_affiliation_string":"Alibaba Group, Sunnyvale, California, USA","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111310847","display_name":"Guanrou Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guanrou Yang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0008-3614-1346","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113851573","display_name":"Weiqin Li","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiqin Li","raw_affiliation_strings":["Tsinghua University, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0001-0317-727X","affiliations":[{"raw_affiliation_string":"Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113394692","display_name":"Minghui Fang","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minghui Fang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-6488-9695","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101230918","display_name":"Jialong Zuo","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jialong Zuo","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0002-6876-9943","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103359200","display_name":"Xiaoda Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoda Yang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0002-7297-4536","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102011966","display_name":"Tao Jin","orcid":"https://orcid.org/0000-0003-3564-1628"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Jin","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-3564-1628","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103012751","display_name":"Jin Xu","orcid":"https://orcid.org/0000-0002-1409-6731"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jin Xu","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-1409-6731","affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030809906","display_name":"Zemin Liu","orcid":"https://orcid.org/0000-0001-6262-9435"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zemin Liu","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6262-9435","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019204904","display_name":"Ya\u2010Feng Chen","orcid":"https://orcid.org/0009-0004-4446-1391"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yafeng Chen","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0004-4446-1391","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102556618","display_name":"Jionghao Bai","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jionghao Bai","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-7106-513X","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102221957","display_name":"Zhifang Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhifang Guo","raw_affiliation_strings":["Alibaba Group, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-6037-2299","affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5059916277"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15583968,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10632","last_page":"10641"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8084999918937683},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.5774999856948853},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5622000098228455},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.4880000054836273},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.42500001192092896},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4004000127315521},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.39800000190734863}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8095999956130981},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8084999918937683},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6682999730110168},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.5774999856948853},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5622000098228455},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.4880000054836273},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4004000127315521},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.39800000190734863},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.37389999628067017},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.36500000953674316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35740000009536743},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.3504999876022339},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34060001373291016},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3400999903678894},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.2556000053882599}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755816","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755816","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1968558813","https://openalex.org/W2805866554","https://openalex.org/W2972359262","https://openalex.org/W3215615641","https://openalex.org/W4402112533","https://openalex.org/W4406417959","https://openalex.org/W4406513948"],"related_works":[],"abstract_inverted_index":{"Neural":[0],"audio":[1,21,24,28,172,205],"codecs,":[2],"used":[3],"as":[4],"speech":[5,15,33,49,63,113,195],"tokenizers,":[6],"have":[7],"demonstrated":[8],"remarkable":[9],"potential":[10],"in":[11,43,64,193],"the":[12,57,68,71,86,108,129,141,179,184,188],"field":[13],"of":[14,32,70,110,143,183,190,204],"generation.":[16],"However,":[17],"to":[18,56,85,106,136,159],"ensure":[19,137],"high-fidelity":[20],"reconstruction,":[22],"neural":[23,116,171,198],"codecs":[25,173],"typically":[26],"encode":[27],"into":[29,156],"long":[30,111],"sequences":[31,51],"tokens,":[34],"posing":[35],"a":[36,101],"significant":[37],"challenge":[38,109],"for":[39],"downstream":[40,175],"language":[41,103,118,176,200],"models":[42,177],"long-context":[44],"modeling.":[45],"We":[46],"observe":[47],"that":[48],"token":[50,73,88,154,191],"exhibit":[52],"short-range":[53],"dependency:":[54],"due":[55],"monotonic":[58],"alignment":[59,139],"between":[60],"text":[61,138],"and":[62,90,123,131,140,174,181],"text-to-speech":[65],"(TTS)":[66],"tasks,":[67],"prediction":[69,89,135],"current":[72,87],"primarily":[74],"relies":[75],"on":[76,169],"its":[77],"local":[78,132],"context,":[79],"while":[80,163],"long-range":[81,153],"tokens":[82,114,133],"contribute":[83],"less":[84],"often":[91],"contain":[92],"redundant":[93,161],"information.":[94],"Inspired":[95],"by":[96],"this":[97],"observation,":[98],"we":[99],"propose":[100],"compressed-to-fine":[102],"modeling":[104],"approach":[105,127,151],"address":[107],"sequence":[112],"within":[115,197],"codec":[117,199],"models:":[119],"(1)":[120],"Fine-grained":[121],"Initial":[122],"Short-range":[124],"Information:":[125],"Our":[126,150],"retains":[128],"prompt":[130],"during":[134],"integrity":[142],"paralinguistic":[144],"information;":[145],"(2)":[146],"Compressed":[147],"Long-range":[148],"Context:":[149],"compresses":[152],"spans":[155],"compact":[157],"representations":[158],"reduce":[160],"information":[162],"preserving":[164],"essential":[165],"semantics.":[166],"Extensive":[167],"experiments":[168],"various":[170],"validate":[178],"effectiveness":[180],"generalizability":[182],"proposed":[185],"approach,":[186],"highlighting":[187],"importance":[189],"compression":[192],"improving":[194],"generation":[196],"models.":[201],"The":[202],"demo":[203],"samples":[206],"will":[207],"be":[208],"available":[209],"at":[210],"https://anonymous.4open.science/r/SpeechTokenPredictionViaCompressedToFinedLM.":[211]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
