{"id":"https://openalex.org/W4402705930","doi":"https://doi.org/10.1145/3664647.3681680","title":"VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling","display_name":"VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4402705930","doi":"https://doi.org/10.1145/3664647.3681680"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681680","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681680","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3664647.3681680","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100749226","display_name":"Yixuan Zhou","orcid":"https://orcid.org/0009-0002-6363-891X"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yixuan Zhou","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018252335","display_name":"Xiaoyu Qin","orcid":"https://orcid.org/0000-0002-9720-3220"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Qin","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033137691","display_name":"Zeyu Jin","orcid":"https://orcid.org/0000-0001-8465-8878"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyu Jin","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111291993","display_name":"Shuoyi Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuoyi Zhou","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China","Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103014224","display_name":"Shun Lei","orcid":"https://orcid.org/0000-0003-3597-3913"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shun Lei","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026277472","display_name":"Songtao Zhou","orcid":"https://orcid.org/0009-0008-5972-3955"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songtao Zhou","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China","Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102869280","display_name":"Zhiyong Wu","orcid":"https://orcid.org/0000-0001-8533-0524"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Wu","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039477812","display_name":"Jia Jia","orcid":"https://orcid.org/0009-0005-8449-278X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jia Jia","raw_affiliation_strings":["BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China"],"affiliations":[{"raw_affiliation_string":"BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5100749226"],"corresponding_institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":1.6963,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.86837079,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"554","last_page":"563"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9033374786376953},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5523855090141296},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49808669090270996},{"id":"https://openalex.org/keywords/speech-analytics","display_name":"Speech analytics","score":0.4927830994129181},{"id":"https://openalex.org/keywords/speech-technology","display_name":"Speech technology","score":0.47776472568511963},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4658653736114502},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4563804268836975},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.42454618215560913},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4192529320716858},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4143558144569397}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9033374786376953},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5523855090141296},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49808669090270996},{"id":"https://openalex.org/C54953205","wikidata":"https://www.wikidata.org/wiki/Q4142201","display_name":"Speech analytics","level":4,"score":0.4927830994129181},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.47776472568511963},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4658653736114502},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4563804268836975},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.42454618215560913},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4192529320716858},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4143558144569397}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3664647.3681680","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681680","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2408.15676","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.15676","pdf_url":"https://arxiv.org/pdf/2408.15676","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3664647.3681680","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681680","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.75,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1888327543","display_name":null,"funder_award_id":"15001","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2603945996","display_name":null,"funder_award_id":"62076144","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2981938667","display_name":null,"funder_award_id":"Shenzhen","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5249178904","display_name":null,"funder_award_id":"Grant No. 6","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5898128861","display_name":null,"funder_award_id":"61405150","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7726157001","display_name":null,"funder_award_id":"Grant No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1583837637","https://openalex.org/W3169483174","https://openalex.org/W3198694222","https://openalex.org/W3203407300","https://openalex.org/W3209059054","https://openalex.org/W4281758439","https://openalex.org/W4313021454","https://openalex.org/W4388233464","https://openalex.org/W4388979610","https://openalex.org/W4390075359","https://openalex.org/W4392904245","https://openalex.org/W4392908903","https://openalex.org/W4398152753","https://openalex.org/W4402703639"],"related_works":["https://openalex.org/W4200068392","https://openalex.org/W2537969829","https://openalex.org/W2550171623","https://openalex.org/W2500421879","https://openalex.org/W596245619","https://openalex.org/W3107456284","https://openalex.org/W2184371793","https://openalex.org/W4388404911","https://openalex.org/W2105439218","https://openalex.org/W2066051122"],"abstract_inverted_index":{"Recent":[0],"AIGC":[1,74],"systems":[2],"possess":[3],"the":[4,41,77,91,95,133,142,151,156,192,207],"capability":[5],"to":[6,26,31,85,97,153],"generate":[7],"digital":[8],"multimedia":[9],"content":[10,46,157],"based":[11],"on":[12],"human":[13,32,59,127,136,196,216],"language":[14,116,188],"instructions,":[15,164],"such":[16],"as":[17,170],"text,":[18],"image":[19],"and":[20,49,53,68,140,202,214,229],"video.":[21],"However,":[22],"when":[23],"it":[24],"comes":[25],"speech,":[27],"existing":[28],"methods":[29],"related":[30],"instruction-to-speech":[33,128],"generation":[34,139,144],"exhibit":[35],"two":[36],"limitations.":[37],"Firstly,":[38],"they":[39],"require":[40],"division":[42,62],"of":[43,56,79,135,158,210],"inputs":[44],"into":[45,124,185],"prompt":[47,51,84,213],"(transcript)":[48],"description":[50,83],"(style":[52],"speaker),":[54],"instead":[55],"directly":[57],"supporting":[58],"instruction.":[60],"This":[61],"is":[63,223],"less":[64],"natural":[65],"in":[66],"form":[67],"does":[69],"not":[70],"align":[71],"with":[72,146],"other":[73,147],"models.":[75],"Secondly,":[76],"practice":[78],"utilizing":[80],"an":[81,171],"independent":[82],"model":[86,152,200],"speech":[87,99,138,143,160,167,194,212,220],"style,":[88],"without":[89],"considering":[90],"transcript":[92],"content,":[93],"restricts":[94],"ability":[96],"control":[98],"at":[100],"a":[101,111,125,224],"fine-grained":[102],"level.":[103],"To":[104,149],"address":[105],"these":[106],"limitations,":[107],"we":[108,165],"propose":[109],"VoxInstruct,":[110],"novel":[112],"unified":[113],"multilingual":[114],"codec":[115,187],"modeling":[117],"framework":[118],"that":[119],"extends":[120],"traditional":[121],"text-to-speech":[122],"tasks":[123],"general":[126],"task.":[129],"Our":[130],"approach":[131],"enhances":[132],"expressiveness":[134],"instruction-guided":[137],"aligns":[141],"paradigm":[145],"modalities.":[148],"enable":[150],"automatically":[154],"extract":[155],"synthesized":[159],"from":[161],"raw":[162],"text":[163],"introduce":[166],"semantic":[168],"tokens":[169],"intermediate":[172],"representation":[173],"for":[174,206,218],"instruction-to-content":[175],"guidance.":[176],"We":[177],"also":[178],"incorporate":[179],"multiple":[180],"Classifier-Free":[181],"Guidance":[182],"(CFG)":[183],"strategies":[184,204],"our":[186,199],"model,":[189],"which":[190,222],"strengthens":[191],"generated":[193],"following":[195],"instructions.":[197],"Furthermore,":[198],"architecture":[201],"training":[203],"allow":[205],"simultaneous":[208],"support":[209],"combining":[211],"descriptive":[215],"instruction":[217],"expressive":[219],"synthesis,":[221],"first-of-its-kind":[225],"attempt.":[226],"Codes,":[227],"models":[228],"demos":[230],"are":[231],"at:":[232],"https://github.com/thuhcsi/VoxInstruct.":[233]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
