{"id":"https://openalex.org/W4408354809","doi":"https://doi.org/10.1109/icassp49660.2025.10890328","title":"Faster Speech-LLaMA Inference with Multi-token Prediction","display_name":"Faster Speech-LLaMA Inference with Multi-token Prediction","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408354809","doi":"https://doi.org/10.1109/icassp49660.2025.10890328"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890328","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890328","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004777817","display_name":"Desh Raj","orcid":"https://orcid.org/0000-0002-5038-9400"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Desh Raj","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048538280","display_name":"Gil Keren","orcid":"https://orcid.org/0000-0002-5153-3494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gil Keren","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113970008","display_name":"Junteng Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junteng Jia","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074237839","display_name":"Jay Mahadeokar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jay Mahadeokar","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066166549","display_name":"Ozlem Kalinli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlem Kalinli","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5004777817"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":5.7199,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.94915985,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9778000116348267,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9728000164031982,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7650622129440308},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7561393976211548},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6764122247695923},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4755798876285553},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41707301139831543},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.12063369154930115}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7650622129440308},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7561393976211548},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6764122247695923},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4755798876285553},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41707301139831543},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.12063369154930115}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890328","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890328","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2327501763","https://openalex.org/W2769810959","https://openalex.org/W3095410713","https://openalex.org/W3119308075","https://openalex.org/W4389518760","https://openalex.org/W4392903872","https://openalex.org/W4392903956","https://openalex.org/W4392910583","https://openalex.org/W4392931626","https://openalex.org/W4401042284","https://openalex.org/W6638749077","https://openalex.org/W6748409065","https://openalex.org/W6754244489","https://openalex.org/W6770820644","https://openalex.org/W6771467084","https://openalex.org/W6842258392","https://openalex.org/W6846659131","https://openalex.org/W6847386241","https://openalex.org/W6847478871","https://openalex.org/W6849530321","https://openalex.org/W6857054612","https://openalex.org/W6860434696","https://openalex.org/W6861533439","https://openalex.org/W6866506555"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700"],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,64,131],"(LLMs)":[3],"have":[4],"become":[5],"proficient":[6],"at":[7],"solving":[8],"a":[9,28,110,133],"wide":[10],"variety":[11,134],"of":[12,55,135,143],"tasks,":[13],"including":[14],"those":[15],"involving":[16],"multi-modal":[17],"inputs.":[18],"In":[19,70],"particular,":[20],"instantiating":[21],"an":[22],"LLM":[23],"(such":[24],"as":[25],"LLaMA)":[26],"with":[27],"speech":[29,38],"encoder":[30],"and":[31,58,97,103],"training":[32,124],"it":[33],"on":[34,132],"paired":[35],"data":[36],"imparts":[37],"recognition":[39],"(ASR)":[40],"abilities":[41],"to":[42,51,75],"the":[43,52,59,85,141],"decoder-only":[44],"model,":[45],"hence":[46],"called":[47],"Speech-LLaMA.":[48],"Nevertheless,":[49],"due":[50],"sequential":[53],"nature":[54],"auto-regressive":[56],"inference":[57,68,79,105],"relatively":[60,66],"large":[61],"decoder,":[62],"Speech-LLaMA":[63,78],"require":[65],"high":[67],"time.":[69],"this":[71],"work,":[72],"we":[73],"propose":[74,109],"speed":[76],"up":[77],"by":[80,146],"predicting":[81],"multiple":[82],"tokens":[83],"in":[84],"same":[86],"decoding":[87,114],"step.":[88],"We":[89,107,128],"explore":[90],"several":[91],"model":[92],"architectures":[93],"that":[94,116],"enable":[95],"this,":[96],"investigate":[98],"their":[99],"performance":[100],"using":[101],"threshold-based":[102],"verification-based":[104],"strategies.":[106],"also":[108],"prefix-based":[111],"beam":[112],"search":[113],"method":[115],"allows":[117],"efficient":[118],"minimum":[119],"word":[120],"error":[121],"rate":[122],"(MWER)":[123],"for":[125],"such":[126],"models.":[127],"evaluate":[129],"our":[130],"public":[136],"benchmarks,":[137],"where":[138],"they":[139],"reduce":[140],"number":[142],"decoder":[144],"calls":[145],"\u223c3.2x":[147],"while":[148],"maintaining":[149],"or":[150],"improving":[151],"WER":[152],"performance.":[153]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-12-28T23:10:05.387466","created_date":"2025-10-10T00:00:00"}
