{"id":"https://openalex.org/W3142067363","doi":"https://doi.org/10.1109/slt48900.2021.9383626","title":"Frame-Level Specaugment for Deep Convolutional Neural Networks in Hybrid ASR Systems","display_name":"Frame-Level Specaugment for Deep Convolutional Neural Networks in Hybrid ASR Systems","publication_year":2021,"publication_date":"2021-01-19","ids":{"openalex":"https://openalex.org/W3142067363","doi":"https://doi.org/10.1109/slt48900.2021.9383626","mag":"3142067363"},"language":"en","primary_location":{"id":"doi:10.1109/slt48900.2021.9383626","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383626","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100461522","display_name":"Xinwei Li","orcid":"https://orcid.org/0009-0005-9677-0011"},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Xinwei Li","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100320635","display_name":"Yuanyuan Zhang","orcid":"https://orcid.org/0000-0001-9738-604X"},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yuanyuan Zhang","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110376025","display_name":"Xiaodan Zhuang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xiaodan Zhuang","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069329694","display_name":"Daben Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Daben Liu","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100461522"],"corresponding_institution_ids":["https://openalex.org/I4210107260"],"apc_list":null,"apc_paid":null,"fwci":0.5439,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.71388648,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"209","last_page":"214"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.820117175579071},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.728387713432312},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7231364846229553},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.6045350432395935},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5803050994873047},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5705150365829468},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.5475746393203735},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.5434766411781311},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.5262554883956909},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5008723735809326},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.45078718662261963},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.44977447390556335},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3689919710159302},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.36195945739746094}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.820117175579071},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.728387713432312},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7231364846229553},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.6045350432395935},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5803050994873047},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5705150365829468},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.5475746393203735},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.5434766411781311},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.5262554883956909},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5008723735809326},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.45078718662261963},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.44977447390556335},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3689919710159302},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.36195945739746094},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt48900.2021.9383626","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt48900.2021.9383626","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5400000214576721}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W1828163288","https://openalex.org/W1877570817","https://openalex.org/W1922655562","https://openalex.org/W2099621636","https://openalex.org/W2114016253","https://openalex.org/W2131342762","https://openalex.org/W2157749010","https://openalex.org/W2194775991","https://openalex.org/W2327501763","https://openalex.org/W2397147568","https://openalex.org/W2403195671","https://openalex.org/W2405883473","https://openalex.org/W2407080277","https://openalex.org/W2514966966","https://openalex.org/W2515863432","https://openalex.org/W2526425061","https://openalex.org/W2617258110","https://openalex.org/W2696967604","https://openalex.org/W2750499125","https://openalex.org/W2936774411","https://openalex.org/W2940200615","https://openalex.org/W2962760690","https://openalex.org/W2962824709","https://openalex.org/W2963454111","https://openalex.org/W2973215447","https://openalex.org/W2983434507","https://openalex.org/W3001899777","https://openalex.org/W3008525923","https://openalex.org/W3014408449","https://openalex.org/W3015480556","https://openalex.org/W3015537910","https://openalex.org/W3015726069","https://openalex.org/W3015995734","https://openalex.org/W3029982911","https://openalex.org/W3096104971","https://openalex.org/W6638749077","https://openalex.org/W6640090968","https://openalex.org/W6675409298","https://openalex.org/W6679429981","https://openalex.org/W6713762819","https://openalex.org/W6713844269","https://openalex.org/W6728030952","https://openalex.org/W6739879593","https://openalex.org/W6775557069"],"related_works":["https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W2038083449","https://openalex.org/W3177678247","https://openalex.org/W1999617572","https://openalex.org/W2944572343","https://openalex.org/W2333799855","https://openalex.org/W2351687372","https://openalex.org/W2004087835","https://openalex.org/W3000197790"],"abstract_inverted_index":{"Inspired":[0],"by":[1,116],"SpecAugment":[2,16,82],"-":[3],"a":[4,14],"data":[5,137,181],"augmentation":[6,130],"method":[7,17],"for":[8,29,83,123,182],"end-to-end":[9],"ASR":[10,33,121],"systems,":[11],"we":[12],"propose":[13],"frame-level":[15],"(f-SpecAugment)":[18],"to":[19,36,64,105,133,175],"improve":[20],"the":[21,37,55,58,79,91,127,140,149,177],"performance":[22],"of":[23,53,108,129,151,179],"deep":[24,84,183],"convolutional":[25],"neural":[26],"networks":[27],"(CNN)":[28],"hybrid":[30,87],"HMM":[31],"based":[32,86],"systems.":[34],"Similar":[35],"utterance":[38,59,80],"level":[39,81],"SpecAugment,":[40],"f-SpecAugment":[41,61,74,93,113,162,170],"performs":[42],"three":[43],"transformations:":[44],"time":[45,50],"warping,":[46],"frequency":[47],"masking,":[48],"and":[49],"masking.":[51],"Instead":[52],"applying":[54],"transformations":[56],"at":[57],"level,":[60],"applies":[62],"them":[63],"each":[65],"convolution":[66],"window":[67],"independently":[68],"during":[69],"training.":[70],"We":[71,89,111,166],"demonstrate":[72,155,168],"that":[73,156,169],"is":[75,145,163],"more":[76],"effective":[77],"than":[78],"CNN":[85,98],"models.":[88],"evaluate":[90],"proposed":[92],"on":[94],"50-layer":[95],"Self-Normalizing":[96],"Deep":[97],"(SNDCNN)":[99],"acoustic":[100],"models":[101],"trained":[102],"with":[103,158],"up":[104],"25000":[106],"hours":[107],"training":[109,136,143,160,180],"data.":[110],"observe":[112],"reduces":[114],"WER":[115],"0.5-4.5%":[117],"relatively":[118],"across":[119],"different":[120],"tasks":[122],"four":[124],"languages.":[125],"As":[126],"benefits":[128,172],"techniques":[131],"tend":[132],"diminish":[134],"as":[135],"size":[138],"increases,":[139],"large":[141],"scale":[142],"reported":[144],"important":[146],"in":[147],"understanding":[148],"effectiveness":[150],"f-SpecAugment.":[152],"Our":[153],"experiments":[154],"even":[157],"25k":[159],"data,":[161],"still":[164],"effective.":[165],"also":[167],"has":[171],"approximately":[173],"equivalent":[174],"doubling":[176],"amount":[178],"CNNs.":[184]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
