{"id":"https://openalex.org/W3095189764","doi":"https://doi.org/10.21437/interspeech.2020-1778","title":"Semantic Mask for Transformer Based End-to-End Speech Recognition","display_name":"Semantic Mask for Transformer Based End-to-End Speech Recognition","publication_year":2020,"publication_date":"2020-10-25","ids":{"openalex":"https://openalex.org/W3095189764","doi":"https://doi.org/10.21437/interspeech.2020-1778","mag":"3095189764"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2020-1778","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-1778","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101863385","display_name":"Chengyi Wang","orcid":"https://orcid.org/0000-0002-6780-9299"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengyi Wang","raw_affiliation_strings":["Microsoft Research Asia, Beijing"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101709477","display_name":"Yu Wu","orcid":"https://orcid.org/0000-0002-5715-3011"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Wu","raw_affiliation_strings":["Microsoft Research Asia, Beijing"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014240896","display_name":"Yujiao Du","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yujiao Du","raw_affiliation_strings":["Beijing University of Posts and Telecommunications {v-chengw, Wu"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications {v-chengw, Wu","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365053","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-1089-9748"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Speech and Language Group"],"affiliations":[{"raw_affiliation_string":"Microsoft Speech and Language Group","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-0785-8882"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft Research Asia, Beijing"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101607148","display_name":"Liang Lu","orcid":"https://orcid.org/0000-0003-4005-679X"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Liang Lu","raw_affiliation_strings":["Microsoft Speech and Language Group"],"affiliations":[{"raw_affiliation_string":"Microsoft Speech and Language Group","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010323000","display_name":"Shuo Ren","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuo Ren","raw_affiliation_strings":["Microsoft Research Asia, Beijing"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008578397","display_name":"Guoli Ye","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Guoli Ye","raw_affiliation_strings":["Microsoft Speech and Language Group"],"affiliations":[{"raw_affiliation_string":"Microsoft Speech and Language Group","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100329353","display_name":"Sheng Zhao","orcid":"https://orcid.org/0000-0002-9624-5381"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sheng Zhao","raw_affiliation_strings":["Microsoft Speech and Language Group"],"affiliations":[{"raw_affiliation_string":"Microsoft Speech and Language Group","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103036125","display_name":"Ming Zhou","orcid":"https://orcid.org/0009-0005-6873-5710"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ming Zhou","raw_affiliation_strings":["Microsoft Research Asia, Beijing"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing","institution_ids":["https://openalex.org/I4210113369"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5103036125"],"corresponding_institution_ids":["https://openalex.org/I4210113369"],"apc_list":null,"apc_paid":null,"fwci":4.6396,"has_fulltext":false,"cited_by_count":41,"citation_normalized_percentile":{"value":0.9568989,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"971","last_page":"975"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9840999841690063,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.8926163911819458},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7016551494598389},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.653634250164032},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5326091051101685},{"id":"https://openalex.org/keywords/end-user","display_name":"End user","score":0.41734594106674194},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.28754857182502747},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.21052798628807068},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1682462990283966},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.13935062289237976}],"concepts":[{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.8926163911819458},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7016551494598389},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.653634250164032},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5326091051101685},{"id":"https://openalex.org/C91262260","wikidata":"https://www.wikidata.org/wiki/Q528074","display_name":"End user","level":2,"score":0.41734594106674194},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28754857182502747},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.21052798628807068},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1682462990283966},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.13935062289237976},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2020-1778","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-1778","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5899999737739563,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1686810756","https://openalex.org/W1828163288","https://openalex.org/W2085628288","https://openalex.org/W2133564696","https://openalex.org/W2194775991","https://openalex.org/W2250357346","https://openalex.org/W2251321385","https://openalex.org/W2740433069","https://openalex.org/W2892009249","https://openalex.org/W2896457183","https://openalex.org/W2936774411","https://openalex.org/W2941814890","https://openalex.org/W2963362078","https://openalex.org/W2972818416","https://openalex.org/W2981857663","https://openalex.org/W2991213871","https://openalex.org/W3006827623","https://openalex.org/W3015995734","https://openalex.org/W3016010032","https://openalex.org/W3103005696","https://openalex.org/W4385245566","https://openalex.org/W4394666973"],"related_works":["https://openalex.org/W4299590256","https://openalex.org/W2951281592","https://openalex.org/W2166381389","https://openalex.org/W4284893874","https://openalex.org/W3163634122","https://openalex.org/W2919182614","https://openalex.org/W333503034","https://openalex.org/W2054736184","https://openalex.org/W3159728998","https://openalex.org/W2677083173"],"abstract_inverted_index":{"Attention-based":[0],"encoder-decoder":[1,128],"model":[2,52,112,141],"has":[3],"achieved":[4],"impressive":[5],"results":[6],"for":[7,80,142],"both":[8],"automatic":[9],"speech":[10],"recognition":[11],"(ASR)":[12],"and":[13,68,152,156],"text-to-speech":[14],"(TTS)":[15],"tasks.This":[16],"approach":[17,123],"takes":[18],"advantage":[19],"of":[20,24,44,61,84,133,168],"the":[21,29,32,36,42,49,59,92,111,115,119,127,139,158,162,166],"memorization":[22],"capacity":[23],"neural":[25,134],"networks":[26],"to":[27,35,55,90,96,109,113,126],"learn":[28],"mapping":[30],"from":[31,39],"input":[33,93],"sequence":[34,38],"output":[37,99],"scratch,":[40],"without":[41],"assumption":[43],"prior":[45],"knowledge":[46],"such":[47,82],"as":[48],"alignments.However,":[50],"this":[51,71,122,145],"is":[53,64,89,124],"prone":[54],"overfitting,":[56],"especially":[57],"when":[58],"amount":[60],"training":[62,81],"data":[63,154],"limited.Inspired":[65],"by":[66],"SpecAugment":[67],"BERT,":[69],"in":[70,107,144,165],"paper,":[72],"we":[73,137],"propose":[74],"a":[75,97,102,105],"semantic":[76],"mask":[77,91],"based":[78,117],"regularization":[79],"kind":[83],"end-toend":[85],"(E2E)":[86],"model.The":[87],"idea":[88],"features":[94],"corresponding":[95],"particular":[98],"token,":[100],"e.g.,":[101],"word":[103],"or":[104],"wordpiece,":[106],"order":[108],"encourage":[110],"fill":[114],"token":[116],"on":[118,149,161],"contextual":[120],"information.While":[121],"applicable":[125],"framework":[129],"with":[130],"any":[131],"type":[132],"network":[135],"architecture,":[136],"study":[138],"transformer-based":[140],"ASR":[143],"work.We":[146],"perform":[147],"experiments":[148],"Librispeech":[150],"960h":[151],"TedLium2":[153],"sets,":[155],"achieve":[157],"state-of-the-art":[159],"performance":[160],"test":[163],"set":[164],"scope":[167],"E2E":[169],"models.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":10},{"year":2021,"cited_by_count":9},{"year":2020,"cited_by_count":12}],"updated_date":"2026-01-26T23:06:41.788003","created_date":"2025-10-10T00:00:00"}
