{"id":"https://openalex.org/W7148241091","doi":"https://doi.org/10.1109/asru65441.2025.11434717","title":"Masked Self-distilled Transducer-based Keyword Spotting with Semi-autoregressive Decoding","display_name":"Masked Self-distilled Transducer-based Keyword Spotting with Semi-autoregressive Decoding","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148241091","doi":"https://doi.org/10.1109/asru65441.2025.11434717"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434717","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434717","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132801627","display_name":"Yu Xi","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yu Xi","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067918077","display_name":"Xun Gu","orcid":"https://orcid.org/0000-0002-1711-5192"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Gu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132808358","display_name":"Haoyu Li","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoyu Li","raw_affiliation_strings":["Taobao &#x0026; Tmall Group of Alibaba,China"],"affiliations":[{"raw_affiliation_string":"Taobao &#x0026; Tmall Group of Alibaba,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073421844","display_name":"J. J. Song","orcid":"https://orcid.org/0000-0002-7154-6153"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Song","raw_affiliation_strings":["Taobao &#x0026; Tmall Group of Alibaba,China"],"affiliations":[{"raw_affiliation_string":"Taobao &#x0026; Tmall Group of Alibaba,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132792341","display_name":"Bo Zheng","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Zheng","raw_affiliation_strings":["Taobao &#x0026; Tmall Group of Alibaba,China"],"affiliations":[{"raw_affiliation_string":"Taobao &#x0026; Tmall Group of Alibaba,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132801590","display_name":"Kai Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5132801627"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":2.1819,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.92934746,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.807699978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.807699978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.04540000110864639,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12131","display_name":"Wireless Signal Modulation Classification","score":0.019300000742077827,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.9327999949455261},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.911300003528595},{"id":"https://openalex.org/keywords/keyword-spotting","display_name":"Keyword spotting","score":0.8781999945640564},{"id":"https://openalex.org/keywords/spotting","display_name":"Spotting","score":0.49140000343322754},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.41440001130104065},{"id":"https://openalex.org/keywords/simplicity","display_name":"Simplicity","score":0.4066999852657318}],"concepts":[{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.9327999949455261},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.911300003528595},{"id":"https://openalex.org/C2781213101","wikidata":"https://www.wikidata.org/wiki/Q6398558","display_name":"Keyword spotting","level":2,"score":0.8781999945640564},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8033999800682068},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6114000082015991},{"id":"https://openalex.org/C2779506182","wikidata":"https://www.wikidata.org/wiki/Q7580141","display_name":"Spotting","level":2,"score":0.49140000343322754},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4456000030040741},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4187000095844269},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.41440001130104065},{"id":"https://openalex.org/C2776372474","wikidata":"https://www.wikidata.org/wiki/Q508291","display_name":"Simplicity","level":2,"score":0.4066999852657318},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.36039999127388},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3368000090122223},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.30720001459121704}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434717","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434717","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2034940213","https://openalex.org/W2407080277","https://openalex.org/W2507319753","https://openalex.org/W2748659049","https://openalex.org/W2936774411","https://openalex.org/W2963308316","https://openalex.org/W2963628261","https://openalex.org/W2969991587","https://openalex.org/W2976556660","https://openalex.org/W3015639015","https://openalex.org/W3015686596","https://openalex.org/W3016010032","https://openalex.org/W3095173472","https://openalex.org/W3097777922","https://openalex.org/W3161425572","https://openalex.org/W3163582231","https://openalex.org/W4224137820","https://openalex.org/W4224932156","https://openalex.org/W4283700324","https://openalex.org/W4283828241","https://openalex.org/W4293363567","https://openalex.org/W4391021514","https://openalex.org/W4392904393","https://openalex.org/W4402111963","https://openalex.org/W4412129261"],"related_works":[],"abstract_inverted_index":{"RNN-T-based":[0],"keyword":[1],"spotting":[2],"(KWS)":[3],"with":[4],"autoregressive":[5],"decoding":[6,86,112,120],"(AR)":[7],"has":[8],"gained":[9],"attention":[10],"due":[11],"to":[12,59,88],"its":[13],"streaming":[14],"architecture":[15],"and":[16,94],"superior":[17,116],"performance.":[18,39],"However,":[19],"the":[20,23,72,90,115,124],"simplicity":[21],"of":[22,92,118,127],"prediction":[24,57],"network":[25],"in":[26,37],"RNN-T":[27,73],"poses":[28],"an":[29],"overfitting":[30,125],"issue,":[31],"especially":[32],"under":[33],"challenging":[34],"scenarios,":[35],"resulting":[36],"degraded":[38],"In":[40,79],"this":[41],"paper,":[42],"we":[43,81],"propose":[44,82],"a":[45,83],"masked":[46,65],"self-distillation":[47],"(MSD)":[48],"training":[49,63,106],"strategy":[50],"that":[51,104],"avoids":[52],"RNN-Ts":[53],"overly":[54],"relying":[55],"on":[56],"networks":[58],"alleviate":[60],"overfitting.":[61,109],"Such":[62],"enables":[64],"non-autoregressive":[66],"(NAR)":[67],"decoding,":[68,129],"which":[69],"fully":[70],"masks":[71],"predictor":[74],"output":[75],"during":[76],"KWS":[77,101],"decoding.":[78,96],"addition,":[80],"semi-autoregressive":[84],"(SAR)":[85],"approach":[87],"integrate":[89],"advantages":[91],"AR":[93,119],"NAR":[95,128],"Our":[97],"experiments":[98],"across":[99],"multiple":[100],"datasets":[102],"demonstrate":[103],"MSD":[105],"effectively":[107],"alleviates":[108],"The":[110],"SAR":[111],"method":[113],"preserves":[114],"performance":[117],"while":[121],"benefits":[122],"from":[123],"suppression":[126],"achieving":[130],"excellent":[131],"results.":[132]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2026-04-03T00:00:00"}
