{"id":"https://openalex.org/W4408355459","doi":"https://doi.org/10.1109/icassp49660.2025.10888249","title":"NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech Processing Tasks","display_name":"NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech Processing Tasks","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408355459","doi":"https://doi.org/10.1109/icassp49660.2025.10888249"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888249","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888249","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100411301","display_name":"He Huang","orcid":"https://orcid.org/0000-0002-9217-4977"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"He Huang","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101511624","display_name":"Taejin Park","orcid":"https://orcid.org/0000-0003-2040-5884"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Taejin Park","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073181369","display_name":"Kunal Dhawan","orcid":"https://orcid.org/0000-0002-5276-2475"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kunal Dhawan","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022587932","display_name":"Ivan Medennikov","orcid":"https://orcid.org/0000-0001-5381-3433"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ivan Medennikov","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069691915","display_name":"Krishna C. Puvvada","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Krishna C. Puvvada","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040007143","display_name":"Nithin Rao Koluguri","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nithin Rao Koluguri","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100372965","display_name":"Weiqing Wang","orcid":"https://orcid.org/0000-0002-9578-819X"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Weiqing Wang","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040747392","display_name":"Jagadeesh Balam","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jagadeesh Balam","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032957280","display_name":"Boris Ginsburg","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Boris Ginsburg","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100411301"],"corresponding_institution_ids":["https://openalex.org/I4210127875"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02086832,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.43149998784065247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.43149998784065247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/seasoning","display_name":"Seasoning","score":0.8292134404182434},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6411479115486145},{"id":"https://openalex.org/keywords/nest","display_name":"Nest (protein structural motif)","score":0.5108139514923096},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4251052141189575},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.42389115691185},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3936768174171448},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3752520680427551},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.13778525590896606},{"id":"https://openalex.org/keywords/ecology","display_name":"Ecology","score":0.11992660164833069}],"concepts":[{"id":"https://openalex.org/C2777810110","wikidata":"https://www.wikidata.org/wiki/Q7441925","display_name":"Seasoning","level":3,"score":0.8292134404182434},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6411479115486145},{"id":"https://openalex.org/C152630561","wikidata":"https://www.wikidata.org/wiki/Q6997750","display_name":"Nest (protein structural motif)","level":2,"score":0.5108139514923096},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4251052141189575},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.42389115691185},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3936768174171448},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3752520680427551},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.13778525590896606},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.11992660164833069},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C206139338","wikidata":"https://www.wikidata.org/wiki/Q192355","display_name":"Raw material","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888249","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888249","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Zero hunger","score":0.41999998688697815,"id":"https://metadata.un.org/sdg/2"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W123007118","https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2167768673","https://openalex.org/W2752796333","https://openalex.org/W2973072704","https://openalex.org/W2995181338","https://openalex.org/W3008357631","https://openalex.org/W3015698636","https://openalex.org/W3024869864","https://openalex.org/W3038871978","https://openalex.org/W3095410713","https://openalex.org/W3097777922","https://openalex.org/W3100460087","https://openalex.org/W3119308075","https://openalex.org/W3155162503","https://openalex.org/W3196117288","https://openalex.org/W3197411683","https://openalex.org/W3197580070","https://openalex.org/W3198587774","https://openalex.org/W3198608154","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3213029956","https://openalex.org/W3217767527","https://openalex.org/W4220731890","https://openalex.org/W4226033575","https://openalex.org/W4296069155","https://openalex.org/W4313156423","https://openalex.org/W4313316127","https://openalex.org/W4319862635","https://openalex.org/W4385822624","https://openalex.org/W4389315113","https://openalex.org/W4389600306","https://openalex.org/W4391021542","https://openalex.org/W4392902568","https://openalex.org/W4401609151","https://openalex.org/W4402111558","https://openalex.org/W4404782746","https://openalex.org/W6603931906","https://openalex.org/W6688816777","https://openalex.org/W6739901393","https://openalex.org/W6746278845","https://openalex.org/W6755207826","https://openalex.org/W6769196770","https://openalex.org/W6771467084","https://openalex.org/W6779429391","https://openalex.org/W6780218876","https://openalex.org/W6799174933","https://openalex.org/W6803378298","https://openalex.org/W6810007534","https://openalex.org/W6810673746","https://openalex.org/W6847363464","https://openalex.org/W6873771331","https://openalex.org/W6890361730"],"related_works":["https://openalex.org/W2981428355","https://openalex.org/W1834994814","https://openalex.org/W2041273198","https://openalex.org/W1599055764","https://openalex.org/W2131711534","https://openalex.org/W2149163000","https://openalex.org/W2962858469","https://openalex.org/W2289873871","https://openalex.org/W2559040841","https://openalex.org/W114661351"],"abstract_inverted_index":{"Self-supervised":[0],"learning":[1],"(SSL)":[2],"has":[3],"been":[4],"proved":[5],"to":[6,98],"benefit":[7],"a":[8,39,89,123],"wide":[9],"range":[10],"of":[11,26,73,125],"speech":[12,17,28,92,126,131],"processing":[13,127],"tasks,":[14,128],"such":[15,129],"as":[16,47,130],"recognition/translation,":[18,132],"speaker":[19,102,133],"verification":[20],"and":[21,41,84,117,140],"diarization,":[22,134],"etc.":[23,138],"However,":[24],"most":[25],"current":[27],"SSL":[29,44],"approaches":[30],"are":[31,142],"computationally":[32],"expensive.":[33],"In":[34],"this":[35],"paper,":[36],"we":[37,55,76],"introduce":[38],"simplified":[40],"more":[42],"efficient":[43],"framework,":[45],"termed":[46],"NeMo":[48,147],"Encoder":[49],"for":[50,81],"Speech":[51],"Tasks":[52],"(NEST).":[53],"Specifically,":[54],"adopt":[56],"the":[57,96,100],"FastConformer":[58],"architecture":[59],"with":[60],"8x":[61],"sub-sampling":[62],"rate,":[63],"which":[64],"is":[65],"faster":[66],"than":[67],"Transformer":[68],"or":[69,105],"Conformer":[70],"architectures.":[71],"Instead":[72],"clusteringbased":[74],"quantization,":[75],"use":[77],"fixed":[78],"random":[79],"projection":[80],"its":[82],"simplicity":[83],"effectiveness.":[85],"We":[86],"also":[87],"implement":[88],"generalized":[90],"noisy":[91],"augmentation":[93],"that":[94,110],"teaches":[95],"model":[97],"disentangle":[99],"main":[101],"from":[103],"noise":[104],"other":[106],"speakers.":[107],"Experiments":[108],"show":[109],"NEST":[111],"improves":[112],"over":[113],"existing":[114],"self-supervised":[115],"models":[116],"achieves":[118],"new":[119],"state-of-the-art":[120],"performance":[121],"on":[122],"variety":[124],"spoken":[135],"language":[136],"understanding,":[137],"Code":[139],"checkpoints":[141],"publicly":[143],"available":[144],"via":[145],"NVIDIA":[146],"framework<sup":[148],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[149,151,153],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup><sup":[150],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup><sup":[152],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">3</sup>.":[154]},"counts_by_year":[],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
