{"id":"https://openalex.org/W7148392059","doi":"https://doi.org/10.1109/asru65441.2025.11434651","title":"Granite-speech: open-source speech-aware LLMs with strong English ASR capabilities","display_name":"Granite-speech: open-source speech-aware LLMs with strong English ASR capabilities","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148392059","doi":"https://doi.org/10.1109/asru65441.2025.11434651"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434651","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434651","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079994647","display_name":"George Saon","orcid":"https://orcid.org/0009-0004-6837-5009"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"George Saon","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055632925","display_name":"Avihu Dekel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Avihu Dekel","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103531189","display_name":"Alexander Brooks","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alexander Brooks","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069028016","display_name":"Tohru Nagano","orcid":"https://orcid.org/0000-0002-3686-2791"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tohru Nagano","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129713573","display_name":"Abraham Daniels","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abraham Daniels","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089929977","display_name":"Aharon Satt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aharon Satt","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132830439","display_name":"Ashish Mittal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ashish Mittal","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132814469","display_name":"Brian Kingsbury","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brian Kingsbury","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049318367","display_name":"David Haws","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"David Haws","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132802968","display_name":"Edmilson Morais","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Edmilson Morais","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021294422","display_name":"Gakuto Kurata","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gakuto Kurata","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021900623","display_name":"Hagai Aronowitz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hagai Aronowitz","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132828569","display_name":"Ibrahim Ibrahim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ibrahim Ibrahim","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132792873","display_name":"Jeff Kuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jeff Kuo","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132791818","display_name":"Kate Soule","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kate Soule","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023268703","display_name":"Luis Lastras","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luis Lastras","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132823707","display_name":"Masayuki Suzuki","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Masayuki Suzuki","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132823496","display_name":"Ron Hoory","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ron Hoory","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132789909","display_name":"Samuel Thomas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samuel Thomas","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048192694","display_name":"Sashi Novitasari","orcid":"https://orcid.org/0000-0001-7467-5682"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sashi Novitasari","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132794513","display_name":"Takashi Fukuda","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takashi Fukuda","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035448968","display_name":"Vishal Sunder","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vishal Sunder","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132793985","display_name":"Xiaodong Cui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaodong Cui","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054462082","display_name":"Zvi Kons","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zvi Kons","raw_affiliation_strings":["IBM Research"],"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":24,"corresponding_author_ids":["https://openalex.org/A5079994647"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":10.9093,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.9819751,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8086000084877014,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8086000084877014,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.05400000140070915,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.0502999983727932,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.5134999752044678},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.48730000853538513},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46619999408721924},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.3910999894142151},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.3813000023365021},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.36880001425743103},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.3668000102043152},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.35409998893737793},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.3529999852180481}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6958000063896179},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6510000228881836},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.5134999752044678},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.48730000853538513},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46619999408721924},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4551999866962433},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3953999876976013},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.3910999894142151},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3813000023365021},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.36880001425743103},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3668000102043152},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.35409998893737793},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.3529999852180481},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.3425000011920929},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.32499998807907104},{"id":"https://openalex.org/C2778880076","wikidata":"https://www.wikidata.org/wiki/Q750553","display_name":"Brazilian Portuguese","level":3,"score":0.30559998750686646},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2919999957084656},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C2777526511","wikidata":"https://www.wikidata.org/wiki/Q691543","display_name":"Pace","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C110384440","wikidata":"https://www.wikidata.org/wiki/Q1143270","display_name":"Upsampling","level":3,"score":0.26840001344680786},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.25699999928474426},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.25450000166893005}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434651","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434651","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6011075973510742,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2099262442","https://openalex.org/W2936774411","https://openalex.org/W3095410713","https://openalex.org/W3097777922","https://openalex.org/W3100806282","https://openalex.org/W3119308075","https://openalex.org/W3123340107","https://openalex.org/W3196509775","https://openalex.org/W3197140813","https://openalex.org/W3198587774","https://openalex.org/W3198694222","https://openalex.org/W4285210452","https://openalex.org/W4319862635","https://openalex.org/W4391021698","https://openalex.org/W4402670286","https://openalex.org/W4412171133","https://openalex.org/W7133200369","https://openalex.org/W7133224126"],"related_works":[],"abstract_inverted_index":{"Granite-speech":[0],"LLMs":[1],"are":[2,125],"compact":[3],"and":[4,16,58,70,83,105,114,152,154,159],"efficient":[5],"speech":[6,18,30,93,156],"language":[7],"models":[8,22,47,124,141],"specifically":[9],"designed":[10],"for":[11,65],"English":[12,39],"ASR<sup":[13],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[14,130,138,169],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[15],"automatic":[17],"translation":[19,157],"(AST).":[20],"The":[21,72,123],"were":[23,49],"trained":[24,50,85],"by":[25],"modality":[26,94],"aligning":[27],"granite-3.3-instruct":[28],"to":[29,97,108,117,158],"on":[31,38,51,62,128],"publicly":[32],"available":[33,127],"open-source":[34],"corpora.":[35],"Comprehensive":[36],"benchmarking":[37],"ASR":[40,146],"shows":[41],"that":[42,48],"they":[43,59],"outperform":[44],"several":[45],"competitors\u2019":[46],"orders":[52],"of":[53,101],"magnitude":[54],"more":[55],"proprietary":[56],"data,":[57],"keep":[60],"pace":[61],"English-to-X":[63],"AST":[64],"major":[66],"European":[67],"languages,":[68],"Japanese,":[69],"Mandarin.":[71],"speech-specific":[73],"components":[74],"are:":[75],"a":[76,90,133],"conformer":[77],"acoustic":[78,103],"encoder":[79],"using":[80],"block":[81],"attention":[82],"self-conditioning":[84],"with":[86],"connectionist":[87],"temporal":[88,99],"classification,":[89],"windowed":[91],"query-transformer":[92],"adapter":[95],"used":[96],"do":[98],"downsampling":[100],"the":[102,109,120,165],"embeddings":[104],"map":[106],"them":[107],"LLM":[110],"text":[111,121],"embedding":[112],"space,":[113],"LoRA":[115],"adapters":[116],"further":[118],"fine-tune":[119],"LLM.":[122],"freely":[126],"HuggingFace<sup":[129],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>":[131],"under":[132],"permissive":[134],"Apache":[135],"2.0":[136],"license.<sup":[137],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>The":[139],"latest":[140],"(revision":[142],"3.3.2)":[143],"support":[144],"multilingual":[145],"in":[147],"English,":[148],"French,":[149],"German,":[150],"Spanish":[151],"Portuguese":[153],"bidirectional":[155],"from":[160],"English.":[161],"This":[162],"paper":[163],"covers":[164],"initial":[166],"English-only":[167],"release.<sup":[168],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>https://huggingface.co/ibm-granite/granite-speech-3.3-2b":[170],"(and\u2026-8b).":[171]},"counts_by_year":[{"year":2026,"cited_by_count":5}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2026-04-03T00:00:00"}
