{"id":"https://openalex.org/W4401070302","doi":"https://doi.org/10.1109/taslp.2024.3434425","title":"VioLA: Conditional Language Models for Speech Recognition, Synthesis, and Translation","display_name":"VioLA: Conditional Language Models for Speech Recognition, Synthesis, and Translation","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4401070302","doi":"https://doi.org/10.1109/taslp.2024.3434425"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3434425","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3434425","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106558173","display_name":"Tianrui Wang","orcid":"https://orcid.org/0000-0002-2765-5889"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tianrui Wang","raw_affiliation_strings":["Institute of Information Science and the Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Information Science and the Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106557565","display_name":"Long Zhou","orcid":"https://orcid.org/0009-0006-1919-4943"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Zhou","raw_affiliation_strings":["Microsoft Research Aisa, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Aisa, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101577318","display_name":"Ziqiang Zhang","orcid":"https://orcid.org/0000-0003-0110-1543"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziqiang Zhang","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100324098","display_name":"Yu Wu","orcid":"https://orcid.org/0000-0002-1680-8253"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Wu","raw_affiliation_strings":["Microsoft Research Aisa, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Aisa, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-0785-8882"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft Research Aisa, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Aisa, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034136587","display_name":"Yashesh Gaur","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yashesh Gaur","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108961732","display_name":"Zhuo Chen","orcid":"https://orcid.org/0009-0007-3882-3810"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhuo Chen","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365053","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-1089-9748"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corporation, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014662947","display_name":"Furu Wei","orcid":"https://orcid.org/0000-0002-7810-5852"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Furu Wei","raw_affiliation_strings":["Microsoft Research Aisa, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Aisa, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5106558173"],"corresponding_institution_ids":["https://openalex.org/I21193070"],"apc_list":null,"apc_paid":null,"fwci":5.168,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.95978773,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"3709","last_page":"3716"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.6282713413238525},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6113629341125488},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5739414691925049},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5300628542900085},{"id":"https://openalex.org/keywords/viola","display_name":"Viola","score":0.4955238103866577},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4761655032634735},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.43861281871795654},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.12515252828598022},{"id":"https://openalex.org/keywords/history","display_name":"History","score":0.08039790391921997}],"concepts":[{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.6282713413238525},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6113629341125488},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5739414691925049},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5300628542900085},{"id":"https://openalex.org/C68615497","wikidata":"https://www.wikidata.org/wiki/Q80284","display_name":"Viola","level":3,"score":0.4955238103866577},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4761655032634735},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.43861281871795654},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.12515252828598022},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.08039790391921997},{"id":"https://openalex.org/C124086623","wikidata":"https://www.wikidata.org/wiki/Q5994","display_name":"Piano","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0},{"id":"https://openalex.org/C52119013","wikidata":"https://www.wikidata.org/wiki/Q50637","display_name":"Art history","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3434425","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3434425","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320333993","display_name":"Microsoft Research Asia","ror":"https://ror.org/0300m5276"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":56,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1572989473","https://openalex.org/W1995562189","https://openalex.org/W2101105183","https://openalex.org/W2127141656","https://openalex.org/W2130942839","https://openalex.org/W2136545725","https://openalex.org/W2903739847","https://openalex.org/W2936078256","https://openalex.org/W2946200149","https://openalex.org/W2962784628","https://openalex.org/W2962824709","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2997540646","https://openalex.org/W2998386507","https://openalex.org/W3034999214","https://openalex.org/W3095410713","https://openalex.org/W3097206152","https://openalex.org/W3118974591","https://openalex.org/W3161873870","https://openalex.org/W3161940574","https://openalex.org/W3163573274","https://openalex.org/W3203407300","https://openalex.org/W3205644108","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3211278025","https://openalex.org/W4288089799","https://openalex.org/W4292779060","https://openalex.org/W4307323391","https://openalex.org/W4313679638","https://openalex.org/W4323651091","https://openalex.org/W4381786045","https://openalex.org/W4381827575","https://openalex.org/W4385245566","https://openalex.org/W4386071687","https://openalex.org/W4388017359","https://openalex.org/W4389524500","https://openalex.org/W4390528611","https://openalex.org/W4391021623","https://openalex.org/W6634186343","https://openalex.org/W6679436768","https://openalex.org/W6739901393","https://openalex.org/W6755207826","https://openalex.org/W6763832098","https://openalex.org/W6769627184","https://openalex.org/W6770506093","https://openalex.org/W6778823374","https://openalex.org/W6778883912","https://openalex.org/W6795200824","https://openalex.org/W6847363464","https://openalex.org/W6848735303","https://openalex.org/W6850334629","https://openalex.org/W6853998256","https://openalex.org/W6860320428"],"related_works":["https://openalex.org/W576603366","https://openalex.org/W359291820","https://openalex.org/W2766795047","https://openalex.org/W2598805484","https://openalex.org/W373995369","https://openalex.org/W2588998998","https://openalex.org/W2144494989","https://openalex.org/W610432525","https://openalex.org/W186163871","https://openalex.org/W623496018"],"abstract_inverted_index":{"Recent":[0],"research":[1],"shows":[2],"a":[3,28,52,86,159],"big":[4],"convergence":[5],"in":[6],"model":[7,55,124,145,157],"architecture,":[8],"training":[9],"objectives,":[10],"and":[11,41,48,117,134,150,154,161],"inference":[12],"methods":[13],"across":[14],"various":[15,36],"tasks":[16,38,90,152],"for":[17],"different":[18,132],"modalities.":[19],"In":[20,84],"this":[21],"paper,":[22],"we":[23,64],"propose":[24],"<sc":[25,142],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[26,143],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><b>VioLA</b></small>,":[27],"single":[29],"auto-regressive":[30],"Transformer":[31],"decoder-only":[32,156],"network":[33],"that":[34,139],"unifies":[35],"cross-modal":[37,151],"involving":[39],"speech":[40,68],"text,":[42],"such":[43,85],"as":[44,51],"speech-to-text,":[45],"text-to-text,":[46],"text-to-speech,":[47],"speech-to-speech":[49],"tasks,":[50],"conditional":[53,105],"language":[54,106,114],"task":[56,111],"via":[57],"multi-task":[58],"learning":[59],"framework.":[60],"To":[61],"accomplish":[62],"this,":[63],"first":[65],"convert":[66],"the":[67,75,122,127,140,155,166],"utterances":[69],"to":[70,74,93,125],"discrete":[71],"tokens":[72],"(similar":[73],"textual":[76],"data)":[77],"using":[78],"an":[79],"offline":[80],"neural":[81],"codec":[82],"encoder.":[83],"way,":[87],"all":[88],"these":[89],"are":[91],"converted":[92],"token-based":[94],"sequence":[95],"prediction":[96],"problems,":[97],"which":[98],"can":[99,146],"be":[100],"naturally":[101],"handled":[102],"with":[103],"one":[104],"model.":[107],"We":[108],"further":[109],"integrate":[110],"IDs":[112,115],"(TID),":[113],"(LID),":[116],"LSTM-based":[118],"acoustic":[119],"embedding":[120],"into":[121],"proposed":[123,141],"enhance":[126],"modeling":[128],"capability":[129],"of":[130],"handling":[131],"languages":[133],"tasks.":[135],"Experimental":[136],"results":[137],"demonstrate":[138],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">VioLA</small>":[144],"support":[147],"both":[148],"single-modal":[149],"well,":[153],"achieves":[158],"comparable":[160],"even":[162],"better":[163],"performance":[164],"than":[165],"strong":[167],"baselines.":[168]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
