{"id":"https://openalex.org/W4406495604","doi":"https://doi.org/10.1109/slt61566.2024.10832266","title":"Investigating Neural Audio Codecs For Speech Language Model-Based Speech Generation","display_name":"Investigating Neural Audio Codecs For Speech Language Model-Based Speech Generation","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406495604","doi":"https://doi.org/10.1109/slt61566.2024.10832266"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832266","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832266","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100325927","display_name":"Jiaqi Li","orcid":"https://orcid.org/0000-0001-8010-9636"},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]},{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["CN","US"],"is_corresponding":true,"raw_author_name":"Jiaqi Li","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA","The Chinese University of Hong Kong,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"The Chinese University of Hong Kong,Shenzhen,China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100408281","display_name":"Dongmei Wang","orcid":"https://orcid.org/0000-0002-6930-0066"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dongmei Wang","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100363489","display_name":"Xiaofei Wang","orcid":"https://orcid.org/0009-0004-6683-3969"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaofei Wang","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053009352","display_name":"Yao Qian","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yao Qian","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106557565","display_name":"Long Zhou","orcid":"https://orcid.org/0009-0006-1919-4943"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Long Zhou","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-0785-8882"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112853229","display_name":"Midia Yousefi","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Midia Yousefi","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112416737","display_name":"Canrun Li","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Canrun Li","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101375070","display_name":"Chung-Hsien Tsai","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chung-Hsien Tsai","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033383677","display_name":"Zhen Xiao","orcid":"https://orcid.org/0000-0003-3832-3916"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Xiao","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100360930","display_name":"Yanqing Liu","orcid":"https://orcid.org/0000-0002-3520-1887"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yanqing Liu","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034556928","display_name":"Junkun Chen","orcid":"https://orcid.org/0000-0003-0193-238X"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Junkun Chen","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113233256","display_name":"Sheng Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sheng Zhao","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365056","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-5206-8600"},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]},{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA","The Chinese University of Hong Kong,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]},{"raw_affiliation_string":"The Chinese University of Hong Kong,Shenzhen,China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102765381","display_name":"Zhizheng Wu","orcid":"https://orcid.org/0009-0001-1192-9857"},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhizheng Wu","raw_affiliation_strings":["The Chinese University of Hong Kong,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong,Shenzhen,China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089195158","display_name":"Michael Zeng","orcid":"https://orcid.org/0000-0001-5302-5883"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Zeng","raw_affiliation_strings":["Microsoft, One Microsoft Way,Redmond,WA,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, One Microsoft Way,Redmond,WA,USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":16,"corresponding_author_ids":["https://openalex.org/A5100325927"],"corresponding_institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I4210116924"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2389035,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"554","last_page":"561"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8222038745880127},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7669678926467896},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.5693637728691101},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.5091027617454529},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4908915162086487},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.48803988099098206},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.4758310616016388},{"id":"https://openalex.org/keywords/speech-technology","display_name":"Speech technology","score":0.44363951683044434},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4360229969024658},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.42893919348716736},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.41936951875686646},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32138723134994507},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.06329792737960815}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8222038745880127},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7669678926467896},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.5693637728691101},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.5091027617454529},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4908915162086487},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.48803988099098206},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.4758310616016388},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.44363951683044434},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4360229969024658},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.42893919348716736},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.41936951875686646},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32138723134994507},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.06329792737960815}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt61566.2024.10832266","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832266","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":54,"referenced_works":["https://openalex.org/W569478347","https://openalex.org/W1494198834","https://openalex.org/W1552314771","https://openalex.org/W2067295501","https://openalex.org/W2107860279","https://openalex.org/W2593463961","https://openalex.org/W2752796333","https://openalex.org/W2914304175","https://openalex.org/W2990138404","https://openalex.org/W3030437843","https://openalex.org/W3034302232","https://openalex.org/W3037038648","https://openalex.org/W3196475561","https://openalex.org/W3209984917","https://openalex.org/W3215615641","https://openalex.org/W4226033575","https://openalex.org/W4252337780","https://openalex.org/W4288099666","https://openalex.org/W4292779060","https://openalex.org/W4307323391","https://openalex.org/W4313021454","https://openalex.org/W4313679638","https://openalex.org/W4367016628","https://openalex.org/W4377010126","https://openalex.org/W4381786045","https://openalex.org/W4381827575","https://openalex.org/W4382603054","https://openalex.org/W4392019859","https://openalex.org/W4392903006","https://openalex.org/W4393969054","https://openalex.org/W4394671563","https://openalex.org/W4399554695","https://openalex.org/W4402111239","https://openalex.org/W4402670057","https://openalex.org/W4402915905","https://openalex.org/W4404740148","https://openalex.org/W6769196770","https://openalex.org/W6771467084","https://openalex.org/W6771812881","https://openalex.org/W6778883912","https://openalex.org/W6783867762","https://openalex.org/W6790356757","https://openalex.org/W6840815571","https://openalex.org/W6848735303","https://openalex.org/W6853165267","https://openalex.org/W6853188576","https://openalex.org/W6853515095","https://openalex.org/W6853888607","https://openalex.org/W6853998256","https://openalex.org/W6860273036","https://openalex.org/W6861548716","https://openalex.org/W6863379155","https://openalex.org/W6869227545","https://openalex.org/W6869425021"],"related_works":["https://openalex.org/W1911859126","https://openalex.org/W2131711534","https://openalex.org/W642007152","https://openalex.org/W2341426843","https://openalex.org/W1583620810","https://openalex.org/W4387712795","https://openalex.org/W2355899496","https://openalex.org/W2184127972","https://openalex.org/W2110852049","https://openalex.org/W2218471654"],"abstract_inverted_index":{"Neural":[0],"audio":[1],"codec":[2,26,41,54,61,81,109,121],"tokens":[3,42,82],"serve":[4],"as":[5],"the":[6,25,29,34,64],"fundamental":[7],"building":[8],"blocks":[9],"for":[10,46,52,125],"speech":[11,15,30,47,89,106,115,127,132],"language":[12],"model":[13],"(SLM)-based":[14],"generation.":[16],"However,":[17],"there":[18],"is":[19,123],"no":[20],"systematic":[21],"understanding":[22],"on":[23,63,136],"how":[24],"system":[27,91],"affects":[28],"generation":[31,48,90,116],"performance":[32,74],"of":[33],"SLM.":[35,118],"In":[36],"this":[37],"work,":[38],"we":[39],"examine":[40],"within":[43],"SLM":[44,85],"framework":[45],"to":[49,71],"provide":[50],"insights":[51],"effective":[53],"design.":[55],"We":[56,79],"retrain":[57],"existing":[58],"high-performing":[59],"neural":[60],"models":[62],"same":[65],"data":[66],"set":[67],"and":[68,92],"loss":[69],"functions":[70],"compare":[72],"their":[73],"in":[75,108,117,129],"a":[76],"uniform":[77],"setting.":[78],"integrate":[80],"into":[83],"two":[84],"systems:":[86],"masked-based":[87],"parallel":[88],"an":[93],"auto-regressive":[94],"(AR)":[95],"plus":[96],"non-auto-regressive":[97],"(NAR)":[98],"model-based":[99],"system.":[100],"Our":[101],"findings":[102],"indicate":[103],"that":[104],"better":[105],"reconstruction":[107],"systems":[110],"does":[111],"not":[112],"guarantee":[113],"improved":[114],"A":[119],"high-quality":[120],"decoder":[122],"crucial":[124],"natural":[126],"production":[128],"SLM,":[130],"while":[131],"intelligibility":[133],"depends":[134],"more":[135],"quantization":[137],"mechanism.":[138]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
