{"id":"https://openalex.org/W4408355621","doi":"https://doi.org/10.1109/icassp49660.2025.10890482","title":"M-BEST-RQ: A Multi-Channel Speech Foundation Model for Smart Glasses","display_name":"M-BEST-RQ: A Multi-Channel Speech Foundation Model for Smart Glasses","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408355621","doi":"https://doi.org/10.1109/icassp49660.2025.10890482"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890482","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890482","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100730499","display_name":"Yufeng Yang","orcid":"https://orcid.org/0000-0003-0780-2703"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yufeng Yang","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004777817","display_name":"Desh Raj","orcid":"https://orcid.org/0000-0002-5038-9400"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Desh Raj","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102074223","display_name":"Ju Lin","orcid":"https://orcid.org/0009-0006-7448-9752"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ju Lin","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071286568","display_name":"Niko Moritz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niko Moritz","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113970008","display_name":"Junteng Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junteng Jia","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048538280","display_name":"Gil Keren","orcid":"https://orcid.org/0000-0002-5153-3494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gil Keren","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045428440","display_name":"Egor Lakomkin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Egor Lakomkin","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102111886","display_name":"Yiteng Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yiteng Huang","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055065049","display_name":"Jacob Donley","orcid":"https://orcid.org/0000-0002-8401-798X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacob Donley","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074237839","display_name":"Jay Mahadeokar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jay Mahadeokar","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066166549","display_name":"Ozlem Kalinli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlem Kalinli","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5100730499"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.5285,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.79049204,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.9648000001907349,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.9648000001907349,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10914","display_name":"Tactile and Sensory Interactions","score":0.9228000044822693,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.7332560420036316},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5699900984764099},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5341593027114868},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.42469435930252075},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.228580504655838},{"id":"https://openalex.org/keywords/history","display_name":"History","score":0.09297254681587219}],"concepts":[{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.7332560420036316},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5699900984764099},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5341593027114868},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.42469435930252075},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.228580504655838},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.09297254681587219},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890482","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890482","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2799473636","https://openalex.org/W2936774411","https://openalex.org/W2973049979","https://openalex.org/W2995181338","https://openalex.org/W3094821064","https://openalex.org/W3097777922","https://openalex.org/W3160207687","https://openalex.org/W3197580070","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4281492411","https://openalex.org/W4312356258","https://openalex.org/W4385822839","https://openalex.org/W4386269370","https://openalex.org/W4392902568","https://openalex.org/W4392904458","https://openalex.org/W4392909605","https://openalex.org/W4393152829","https://openalex.org/W4401609151","https://openalex.org/W4403640425","https://openalex.org/W4403998251","https://openalex.org/W6637373629","https://openalex.org/W6780218876","https://openalex.org/W6797761920","https://openalex.org/W6800751262","https://openalex.org/W6810673746","https://openalex.org/W6856983930","https://openalex.org/W6857331512"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2381393187","https://openalex.org/W2332779545","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W2358060160","https://openalex.org/W2035483685"],"abstract_inverted_index":{"The":[0],"growing":[1],"popularity":[2],"of":[3,16,44,92,157,173,180],"multi-channel":[4,55,80],"wearable":[5],"devices,":[6],"such":[7,18],"as":[8,19],"smart":[9,60],"glasses,":[10,61],"has":[11],"led":[12],"to":[13,29,65,96,137],"a":[14,90,131,163],"surge":[15],"applications":[17],"targeted":[20],"speech":[21,56,81,104],"recognition":[22,105],"and":[23,112,125],"enhanced":[24],"hearing.":[25],"However,":[26],"current":[27],"approaches":[28],"solve":[30],"these":[31],"tasks":[32,95],"use":[33],"independently":[34],"trained":[35,169],"models,":[36],"which":[37,62,119,176],"may":[38],"not":[39],"benefit":[40],"from":[41,122],"large":[42],"amounts":[43],"unlabeled":[45],"data.":[46],"In":[47],"this":[48],"paper,":[49],"we":[50,88],"propose":[51],"M-BEST-RQ,":[52],"the":[53,123,147,178],"first":[54],"foundation":[57],"model":[58,161],"for":[59],"is":[63,135,168],"designed":[64],"leverage":[66],"large-scale":[67],"self-supervised":[68],"learning":[69],"(SSL)":[70],"in":[71,151],"an":[72],"array-geometry":[73],"agnostic":[74],"approach.":[75,182],"While":[76],"prior":[77],"work":[78],"on":[79,85,170],"SSL":[82],"only":[83,154],"evaluated":[84],"simulated":[86],"settings,":[87],"curate":[89],"suite":[91],"real":[93],"downstream":[94],"evaluate":[97],"our":[98,160,181],"model,":[99],"namely":[100],"(i)":[101],"conversational":[102,148],"automatic":[103],"(ASR),":[106],"(ii)":[107],"spherical":[108],"active":[109],"source":[110],"localization,":[111],"(iii)":[113],"glasses":[114],"wearer":[115],"voice":[116],"activity":[117],"detection,":[118],"are":[120],"sourced":[121],"MMCSG":[124],"EasyCom":[126],"datasets.":[127],"We":[128],"show":[129],"that":[130,167],"general-purpose":[132],"M-BEST-RQ":[133],"encoder":[134],"able":[136],"match":[138],"or":[139],"surpass":[140],"supervised":[141,164],"models":[142],"across":[143],"all":[144],"tasks.":[145],"For":[146],"ASR":[149,165],"task":[150],"particular,":[152],"using":[153],"8":[155],"hours":[156,172],"labeled":[158,174],"speech,":[159],"outperforms":[162],"baseline":[166],"2000":[171],"data,":[175],"demonstrates":[177],"effectiveness":[179]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
