{"id":"https://openalex.org/W3163211337","doi":"https://doi.org/10.1109/icassp39728.2021.9414639","title":"Multistream CNN for Robust Acoustic Modeling","display_name":"Multistream CNN for Robust Acoustic Modeling","publication_year":2021,"publication_date":"2021-05-13","ids":{"openalex":"https://openalex.org/W3163211337","doi":"https://doi.org/10.1109/icassp39728.2021.9414639","mag":"3163211337"},"language":"en","primary_location":{"id":"doi:10.1109/icassp39728.2021.9414639","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414639","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106859201","display_name":"Kyu J. Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kyu J. Han","raw_affiliation_strings":["ASAPP,Mountain View,CA,USA","ASAPP, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"ASAPP,Mountain View,CA,USA","institution_ids":[]},{"raw_affiliation_string":"ASAPP, Mountain View, CA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101875418","display_name":"Jing Pan","orcid":"https://orcid.org/0000-0002-5178-2247"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jing Pan","raw_affiliation_strings":["ASAPP,Mountain View,CA,USA","ASAPP, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"ASAPP,Mountain View,CA,USA","institution_ids":[]},{"raw_affiliation_string":"ASAPP, Mountain View, CA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087462573","display_name":"Venkata Krishna Naveen Tadala","orcid":null},"institutions":[{"id":"https://openalex.org/I4210111469","display_name":"Sensory Cloud (United States)","ror":"https://ror.org/01vny5262","country_code":"US","type":"company","lineage":["https://openalex.org/I4210111469"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Venkata Krishna Naveen Tadala","raw_affiliation_strings":["Sensory,Portland,OR,USA","Sensory, Portland, OR, USA"],"affiliations":[{"raw_affiliation_string":"Sensory,Portland,OR,USA","institution_ids":["https://openalex.org/I4210111469"]},{"raw_affiliation_string":"Sensory, Portland, OR, USA","institution_ids":["https://openalex.org/I4210111469"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101774785","display_name":"Tao Ma","orcid":"https://orcid.org/0000-0001-6135-9736"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao Ma","raw_affiliation_strings":["ASAPP,Mountain View,CA,USA","ASAPP, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"ASAPP,Mountain View,CA,USA","institution_ids":[]},{"raw_affiliation_string":"ASAPP, Mountain View, CA, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089659636","display_name":"Dan Povey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dan Povey","raw_affiliation_strings":["Xiaomi Inc.,Beijing,China","Xiaomi Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Xiaomi Inc.,Beijing,China","institution_ids":[]},{"raw_affiliation_string":"Xiaomi Inc., Beijing, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5106859201"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.9431,"has_fulltext":false,"cited_by_count":35,"citation_normalized_percentile":{"value":0.945907,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"6873","last_page":"6877"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8165934085845947},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7439236044883728},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6239498853683472},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.585088849067688},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4695359468460083},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4301595091819763},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42595475912094116},{"id":"https://openalex.org/keywords/test-set","display_name":"Test set","score":0.41184598207473755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8165934085845947},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7439236044883728},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6239498853683472},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.585088849067688},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4695359468460083},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4301595091819763},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42595475912094116},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.41184598207473755},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp39728.2021.9414639","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414639","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.44999998807907104,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1524333225","https://openalex.org/W1552701380","https://openalex.org/W1553004968","https://openalex.org/W1920478749","https://openalex.org/W1922557984","https://openalex.org/W1984746632","https://openalex.org/W2097897204","https://openalex.org/W2101596234","https://openalex.org/W2102512139","https://openalex.org/W2143137842","https://openalex.org/W2153723410","https://openalex.org/W2167763959","https://openalex.org/W2396825535","https://openalex.org/W2397147568","https://openalex.org/W2399383297","https://openalex.org/W2470673105","https://openalex.org/W2514741789","https://openalex.org/W2514966966","https://openalex.org/W2559809918","https://openalex.org/W2799958557","https://openalex.org/W2888867175","https://openalex.org/W2889152503","https://openalex.org/W2899640612","https://openalex.org/W2936774411","https://openalex.org/W2963064586","https://openalex.org/W2963403868","https://openalex.org/W2973201641","https://openalex.org/W2977728428","https://openalex.org/W3006827623","https://openalex.org/W3095838132","https://openalex.org/W4299602298","https://openalex.org/W4302613066","https://openalex.org/W4385245566","https://openalex.org/W6631362777","https://openalex.org/W6632891567","https://openalex.org/W6633288045","https://openalex.org/W6675280436","https://openalex.org/W6713003356","https://openalex.org/W6739901393","https://openalex.org/W6756137172"],"related_works":["https://openalex.org/W4293226380","https://openalex.org/W2770593030","https://openalex.org/W4313906399","https://openalex.org/W4321487865","https://openalex.org/W4281727072","https://openalex.org/W2560201613","https://openalex.org/W3154990682","https://openalex.org/W2171975302","https://openalex.org/W2022352247","https://openalex.org/W2342291550"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"multistream":[3,91,161,184],"CNN,":[4],"a":[5,53,132,137],"novel":[6],"neural":[7,35],"network":[8],"architecture":[9,20,93],"for":[10,131,143,187],"robust":[11],"acoustic":[12],"modeling":[13],"in":[14,116,153,198],"speech":[15,23],"recognition":[16],"tasks.":[17],"The":[18,44],"proposed":[19,90],"processes":[21],"input":[22],"with":[24,179],"diverse":[25],"temporal":[26],"resolutions":[27],"by":[28,94,120,167],"applying":[29],"different":[30],"dilation":[31,45],"rates":[32,46],"to":[33,40,80,147,151,189],"convolutional":[34],"networks":[36],"across":[37,103],"multiple":[38],"streams":[39,75],"achieve":[41,190],"the":[42,50,74,81,86,89,110,113,117,154,164,191],"robustness.":[43],"are":[47,76],"selected":[48],"from":[49,73,126],"multiples":[51],"of":[52,56,66,88,112,141,158,194],"sub-sampling":[54],"rate":[55],"3":[57],"frames.":[58],"Each":[59],"stream":[60],"stacks":[61],"TDNN-F":[62,101,166],"layers":[63],"(a":[64],"variant":[65],"1D":[67],"CNN),":[68],"and":[69],"output":[70],"embedding":[71],"vectors":[72],"concatenated":[77],"then":[78],"projected":[79],"final":[82],"layer.":[83],"We":[84],"validate":[85],"effectiveness":[87],"CNN":[92,108,162,185],"showing":[95],"consistent":[96],"improvements":[97],"against":[98],"Kaldi\u2019s":[99],"best":[100,192],"model":[102],"various":[104],"data":[105,125,152],"sets.":[106],"Multistream":[107],"improves":[109],"WER":[111,139,193],"test-other":[114],"set":[115],"LibriSpeech":[118],"corpus":[119],"12%":[121],"(relative).":[122],"On":[123],"custom":[124],"ASAPP\u2019s":[127],"production":[128,175],"ASR":[129],"system":[130],"contact":[133],"center,":[134],"it":[135],"records":[136],"relative":[138],"improvement":[140],"11%":[142],"customer":[144],"channel":[145],"audio":[146],"prove":[148],"its":[149,172],"robustness":[150],"wild.":[155],"In":[156],"terms":[157],"real-time":[159],"factor,":[160],"outperforms":[163],"baseline":[165],"15%,":[168],"which":[169],"also":[170],"suggests":[171],"practicality":[173],"on":[174,196],"systems.":[176],"When":[177],"combined":[178],"self-attentive":[180],"SRU":[181],"LM":[182],"rescoring,":[183],"contributes":[186],"ASAPP":[188],"1.75%":[195],"test-clean":[197],"LibriSpeech.":[199]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":13},{"year":2021,"cited_by_count":9},{"year":2020,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
