{"id":"https://openalex.org/W4415708492","doi":"https://doi.org/10.1109/icme59968.2025.11209883","title":"k2SSL: A Faster and Better Framework for Self-Supervised Speech Representation Learning","display_name":"k2SSL: A Faster and Better Framework for Self-Supervised Speech Representation Learning","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708492","doi":"https://doi.org/10.1109/icme59968.2025.11209883"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11209883","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100743978","display_name":"Yifan Yang","orcid":"https://orcid.org/0000-0003-3313-5473"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yifan Yang","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110937314","display_name":"Jianheng Zhuo","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianheng Zhuo","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075023049","display_name":"Zengrui Jin","orcid":"https://orcid.org/0000-0002-2637-7880"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zengrui Jin","raw_affiliation_strings":["The Chinese University of Hong Kong,China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong,China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438492","display_name":"Ziyang Ma","orcid":"https://orcid.org/0000-0002-0623-9114"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziyang Ma","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100435428","display_name":"Xiaoyu Yang","orcid":"https://orcid.org/0000-0002-3570-8979"},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Yang","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066643565","display_name":"Zengwei Yao","orcid":"https://orcid.org/0000-0002-2331-2387"},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zengwei Yao","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102707172","display_name":"Liyong Guo","orcid":"https://orcid.org/0009-0002-2465-3914"},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liyong Guo","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068307574","display_name":"Wei Kang","orcid":"https://orcid.org/0000-0003-0193-2654"},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Kang","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004231067","display_name":"Fangjun Kuang","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fangjun Kuang","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020006167","display_name":"Na Li","orcid":"https://orcid.org/0000-0003-3246-0039"},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Lin","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084286453","display_name":"Daniel Povey","orcid":"https://orcid.org/0000-0002-0611-3634"},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Daniel Povey","raw_affiliation_strings":["Xiaomi Corporation,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Corporation,Beijing,China","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100329117","display_name":"Xie Chen","orcid":"https://orcid.org/0000-0001-5801-2571"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xie Chen","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5100743978"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17534206,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9452999830245972,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9452999830245972,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.00930000003427267,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.0044999998062849045,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7184000015258789},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6327000260353088},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5809999704360962},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.49869999289512634},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.42640000581741333},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.415800005197525},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4009000062942505},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.36559998989105225}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8029999732971191},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7184000015258789},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6327000260353088},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5809999704360962},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5149000287055969},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.49869999289512634},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45410001277923584},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.42640000581741333},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41999998688697815},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.415800005197525},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.36559998989105225},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3596999943256378},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.35580000281333923},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.34610000252723694},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2888999879360199},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2838999927043915},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C77660490","wikidata":"https://www.wikidata.org/wiki/Q244916","display_name":"Intermediate language","level":3,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11209883","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2143612262","https://openalex.org/W2933138175","https://openalex.org/W2953190524","https://openalex.org/W2962780374","https://openalex.org/W2962784628","https://openalex.org/W2973049979","https://openalex.org/W2995181338","https://openalex.org/W3016010032","https://openalex.org/W3097777922","https://openalex.org/W3197580070","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3211278025","https://openalex.org/W4226507725","https://openalex.org/W4252812408","https://openalex.org/W4281492411","https://openalex.org/W4283324001","https://openalex.org/W4283700324","https://openalex.org/W4319862670","https://openalex.org/W4375869113","https://openalex.org/W4375869165","https://openalex.org/W4385823152","https://openalex.org/W4385823192","https://openalex.org/W4391021652","https://openalex.org/W4391021746","https://openalex.org/W4392903704","https://openalex.org/W4392909760","https://openalex.org/W4402112280","https://openalex.org/W4412886767"],"related_works":[],"abstract_inverted_index":{"Self-supervised":[0],"learning":[1],"(SSL)":[2],"has":[3],"achieved":[4],"great":[5],"success":[6],"in":[7,24,31,35,48,93,135],"speech-related":[8],"tasks.":[9,81],"While":[10],"Transformer":[11],"and":[12,71,85,97,113],"Conformer":[13],"architectures":[14],"have":[15],"dominated":[16],"SSL":[17,40,88,101],"backbones,":[18],"encoders":[19],"like":[20],"Zipformer,":[21],"which":[22],"excel":[23],"automatic":[25],"speech":[26,74],"recognition":[27],"(ASR),":[28],"remain":[29],"unexplored":[30],"SSL.":[32],"Concurrently,":[33],"inefficiencies":[34],"data":[36],"processing":[37],"within":[38],"existing":[39],"training":[41,54,95],"frameworks,":[42],"such":[43],"as":[44],"fairseq,":[45],"pose":[46],"challenges":[47],"managing":[49],"the":[50],"growing":[51],"volumes":[52],"of":[53,143],"data.":[55],"To":[56],"address":[57],"these":[58],"issues,":[59],"we":[60],"propose":[61],"k2SSL,":[62],"an":[63],"open-source":[64],"framework":[65],"that":[66,107],"offers":[67],"faster,":[68],"more":[69],"memory-efficient,":[70],"better-performing":[72],"self-supervised":[73],"representation":[75],"learning,":[76],"focusing":[77],"on":[78,104],"downstream":[79],"ASR":[80],"The":[82],"optimized":[83],"HuBERT":[84,112,125,152],"proposed":[86],"Zipformer-based":[87],"systems":[89],"exhibit":[90],"substantial":[91],"reductions":[92],"both":[94],"time":[96],"memory":[98],"usage":[99],"during":[100],"training.":[102],"Experiments":[103],"LibriSpeech":[105],"demonstrate":[106],"Zipformer":[108,146],"Base":[109,126],"significantly":[110],"outperforms":[111],"WavLM,":[114],"achieving":[115],"up":[116],"to":[117,124,140],"a":[118,131],"34.8%":[119],"relative":[120],"WER":[121],"reduction":[122],"compared":[123],"after":[127],"fine-tuning,":[128],"along":[129],"with":[130],"3.5x":[132],"pre-training":[133,159],"speedup":[134],"GPU":[136],"hours.":[137],"When":[138],"scaled":[139],"60k":[141],"hours":[142],"LibriLight":[144],"data,":[145],"Large":[147],"exhibits":[148],"remarkable":[149],"efficiency,":[150],"matching":[151],"Large\u2019s":[153],"performance":[154],"while":[155],"requiring":[156],"only":[157],"5/8":[158],"steps.":[160]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-30T00:00:00"}
