{"id":"https://openalex.org/W4226052192","doi":"https://doi.org/10.1109/asru51503.2021.9688002","title":"A Comparison of Streaming Models and Data Augmentation Methods for Robust Speech Recognition","display_name":"A Comparison of Streaming Models and Data Augmentation Methods for Robust Speech Recognition","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W4226052192","doi":"https://doi.org/10.1109/asru51503.2021.9688002"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9688002","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9688002","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100396580","display_name":"Jiyeon Kim","orcid":"https://orcid.org/0000-0001-7964-5060"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Jiyeon Kim","raw_affiliation_strings":["Samsung Research,Seoul,South Korea","Samsung Research, Seoul, South Korea"],"affiliations":[{"raw_affiliation_string":"Samsung Research,Seoul,South Korea","institution_ids":["https://openalex.org/I2250650973"]},{"raw_affiliation_string":"Samsung Research, Seoul, South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065995328","display_name":"Mehul Kumar","orcid":"https://orcid.org/0000-0001-7341-2362"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Mehul Kumar","raw_affiliation_strings":["Samsung Research,Seoul,South Korea","Samsung Research, Seoul, South Korea"],"affiliations":[{"raw_affiliation_string":"Samsung Research,Seoul,South Korea","institution_ids":["https://openalex.org/I2250650973"]},{"raw_affiliation_string":"Samsung Research, Seoul, South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113125574","display_name":"Dhananjaya Gowda","orcid":null},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Dhananjaya Gowda","raw_affiliation_strings":["Samsung Research,Seoul,South Korea","Samsung Research, Seoul, South Korea"],"affiliations":[{"raw_affiliation_string":"Samsung Research,Seoul,South Korea","institution_ids":["https://openalex.org/I2250650973"]},{"raw_affiliation_string":"Samsung Research, Seoul, South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043920881","display_name":"Abhinav Garg","orcid":"https://orcid.org/0000-0001-5082-5500"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Abhinav Garg","raw_affiliation_strings":["Samsung Research,Seoul,South Korea","Samsung Research, Seoul, South Korea"],"affiliations":[{"raw_affiliation_string":"Samsung Research,Seoul,South Korea","institution_ids":["https://openalex.org/I2250650973"]},{"raw_affiliation_string":"Samsung Research, Seoul, South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100684422","display_name":"Chanwoo Kim","orcid":"https://orcid.org/0000-0003-0193-8167"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Chanwoo Kim","raw_affiliation_strings":["Samsung Research,Seoul,South Korea","Samsung Research, Seoul, South Korea"],"affiliations":[{"raw_affiliation_string":"Samsung Research,Seoul,South Korea","institution_ids":["https://openalex.org/I2250650973"]},{"raw_affiliation_string":"Samsung Research, Seoul, South Korea","institution_ids":["https://openalex.org/I2250650973"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100396580"],"corresponding_institution_ids":["https://openalex.org/I2250650973"],"apc_list":null,"apc_paid":null,"fwci":0.377,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.62843522,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":"abs 1912 5533","issue":null,"first_page":"989","last_page":"995"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7913661003112793},{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.7552650570869446},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7288424968719482},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6389384269714355},{"id":"https://openalex.org/keywords/reverberation","display_name":"Reverberation","score":0.6190717220306396},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5492324233055115},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5414156913757324},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.46499529480934143},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4449634850025177},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4118034839630127},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3405612111091614},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09297946095466614}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7913661003112793},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.7552650570869446},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7288424968719482},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6389384269714355},{"id":"https://openalex.org/C95851461","wikidata":"https://www.wikidata.org/wiki/Q468809","display_name":"Reverberation","level":2,"score":0.6190717220306396},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5492324233055115},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5414156913757324},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.46499529480934143},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4449634850025177},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4118034839630127},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3405612111091614},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09297946095466614},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9688002","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9688002","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.699999988079071}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1509793305","https://openalex.org/W1828163288","https://openalex.org/W2099621636","https://openalex.org/W2127141656","https://openalex.org/W2326699523","https://openalex.org/W2327501763","https://openalex.org/W2526425061","https://openalex.org/W2551620312","https://openalex.org/W2605141709","https://openalex.org/W2617258110","https://openalex.org/W2622203030","https://openalex.org/W2627092829","https://openalex.org/W2936774411","https://openalex.org/W2962760690","https://openalex.org/W2963040451","https://openalex.org/W2963980003","https://openalex.org/W2972816482","https://openalex.org/W3007185811","https://openalex.org/W3007227084","https://openalex.org/W3008587939","https://openalex.org/W3015194534","https://openalex.org/W3015974384","https://openalex.org/W3015995734","https://openalex.org/W3095193086","https://openalex.org/W3095867028","https://openalex.org/W3096758108","https://openalex.org/W3168770049","https://openalex.org/W4294619417","https://openalex.org/W6638749077","https://openalex.org/W6675409298","https://openalex.org/W6729280272","https://openalex.org/W6735706088","https://openalex.org/W6738243166","https://openalex.org/W6738686518","https://openalex.org/W6739366949","https://openalex.org/W6747158283","https://openalex.org/W6771250757"],"related_works":["https://openalex.org/W1656519308","https://openalex.org/W2042717753","https://openalex.org/W2037265366","https://openalex.org/W2022849831","https://openalex.org/W2391832549","https://openalex.org/W2286653370","https://openalex.org/W3008625068","https://openalex.org/W3128807919","https://openalex.org/W3176411177","https://openalex.org/W3035501883"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,102],"present":[4],"a":[5,170],"comparative":[6],"study":[7],"on":[8,82],"the":[9,68,75,79,83,119,125,132,149],"robustness":[10],"of":[11,78,85,106,121,127,144,151],"two":[12],"different":[13],"online":[14],"streaming":[15,174],"speech":[16,176],"recognition":[17,177],"models:":[18],"Monotonic":[19],"Chunkwise":[20],"Attention":[21],"(MoChA)":[22],"and":[23,51,124,148,162],"Recurrent":[24],"Neural":[25],"Network-Transducer":[26],"(RNN-T).":[27],"We":[28],"explore":[29],"three":[30],"recently":[31],"proposed":[32],"data":[33,90],"augmentation":[34,91],"techniques,":[35],"namely,":[36],"multi-conditioned":[37],"training":[38,69,86,105,122],"using":[39],"an":[40],"acoustic":[41],"simulator,":[42],"Vocal":[43],"Tract":[44],"Length":[45],"Perturbation":[46],"(VTLP)":[47],"for":[48,173],"speaker":[49],"variability,":[50],"SpecAugment.":[52],"Experimental":[53],"results":[54],"show":[55],"that":[56,74,104],"unidirectional":[57],"models":[58,94,108,136,141,155,169],"are":[59,156],"in":[60,67,142],"general":[61],"more":[62,112,158],"sensitive":[63,113],"to":[64,110,114,179],"noisy":[65],"examples":[66,87],"set.":[70],"It":[71],"is":[72],"observed":[73],"final":[76],"performance":[77],"model":[80],"depends":[81],"proportion":[84],"processed":[88],"by":[89],"techniques.":[92,130],"MoChA":[93,107,140,180],"generally":[95,157],"perform":[96,137],"better":[97,138,171],"than":[98,139],"RNN-T":[99,135,154,168],"models.":[100,181],"However,":[101],"observe":[103],"seems":[109],"be":[111],"various":[115],"factors":[116],"such":[117],"as":[118],"characteristics":[120],"sets":[123],"incorporation":[126],"additional":[128],"augmentations":[129],"On":[131],"other":[133],"hand,":[134],"terms":[143],"latency,":[145],"inference":[146],"time,":[147],"stability":[150],"training.":[152],"Additionally,":[153],"robust":[159],"against":[160],"noise":[161],"reverberation.":[163],"All":[164],"these":[165],"advantages":[166],"make":[167],"choice":[172],"on-device":[175],"compared":[178]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
