{"id":"https://openalex.org/W3191151211","doi":"https://doi.org/10.1109/taslp.2021.3104193","title":"Learning Waveform-Based Acoustic Models Using Deep Variational Convolutional Neural Networks","display_name":"Learning Waveform-Based Acoustic Models Using Deep Variational Convolutional Neural Networks","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3191151211","doi":"https://doi.org/10.1109/taslp.2021.3104193","mag":"3191151211"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2021.3104193","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3104193","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://kclpure.kcl.ac.uk/portal/en/publications/17e7016d-f44e-4141-85ba-ab6ab9667ee9","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Dino Oglic","orcid":"https://orcid.org/0000-0002-4728-9644"},"institutions":[{"id":"https://openalex.org/I183935753","display_name":"King's College London","ror":"https://ror.org/0220mzb33","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I183935753"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Dino Oglic","raw_affiliation_strings":["Department of Engineering, King\u2019s College London, London, U.K"],"affiliations":[{"raw_affiliation_string":"Department of Engineering, King\u2019s College London, London, U.K","institution_ids":["https://openalex.org/I183935753"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zoran Cvetkovic","orcid":"https://orcid.org/0000-0002-5128-5099"},"institutions":[{"id":"https://openalex.org/I183935753","display_name":"King's College London","ror":"https://ror.org/0220mzb33","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I183935753"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Zoran Cvetkovic","raw_affiliation_strings":["Department of Engineering, King\u2019s College London, London, U.K"],"affiliations":[{"raw_affiliation_string":"Department of Engineering, King\u2019s College London, London, U.K","institution_ids":["https://openalex.org/I183935753"]}]},{"author_position":"last","author":{"id":null,"display_name":"Peter Sollich","orcid":null},"institutions":[{"id":"https://openalex.org/I183935753","display_name":"King's College London","ror":"https://ror.org/0220mzb33","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I183935753"]},{"id":"https://openalex.org/I4210119896","display_name":"King's College School","ror":"https://ror.org/02bbqcn27","country_code":"GB","type":"education","lineage":["https://openalex.org/I4210119896"]},{"id":"https://openalex.org/I74656192","display_name":"University of G\u00f6ttingen","ror":"https://ror.org/01y9bpm73","country_code":"DE","type":"education","lineage":["https://openalex.org/I74656192"]}],"countries":["DE","GB"],"is_corresponding":false,"raw_author_name":"Peter Sollich","raw_affiliation_strings":["Department of Mathematics, King\u2019s College London, London, U.K","Institute for Theoretical Physics, University of G\u00f6ttingen, G\u00f6ttingen, Germany"],"affiliations":[{"raw_affiliation_string":"Department of Mathematics, King\u2019s College London, London, U.K","institution_ids":["https://openalex.org/I4210119896","https://openalex.org/I183935753"]},{"raw_affiliation_string":"Institute for Theoretical Physics, University of G\u00f6ttingen, G\u00f6ttingen, Germany","institution_ids":["https://openalex.org/I74656192"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I183935753"],"apc_list":null,"apc_paid":null,"fwci":1.1197,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.82363432,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":"29","issue":null,"first_page":"2850","last_page":"2863"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8198000192642212,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8198000192642212,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0778999999165535,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.01889999955892563,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5453000068664551},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5090000033378601},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5077000260353088},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.47360000014305115},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4390999972820282},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.4099999964237213},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4009000062942505},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3986000120639801},{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.3736000061035156}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7109000086784363},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5453000068664551},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5440000295639038},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5090000033378601},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5077000260353088},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.47360000014305115},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4390999972820282},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.4099999964237213},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4009000062942505},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3986000120639801},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.3736000061035156},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3695000112056732},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3433000147342682},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3269999921321869},{"id":"https://openalex.org/C24574437","wikidata":"https://www.wikidata.org/wiki/Q7135228","display_name":"Parametric model","level":3,"score":0.31869998574256897},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.3109000027179718},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.30230000615119934},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.29980000853538513},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.2849999964237213},{"id":"https://openalex.org/C175202392","wikidata":"https://www.wikidata.org/wiki/Q2434543","display_name":"Time delay neural network","level":3,"score":0.2815000116825104},{"id":"https://openalex.org/C102248274","wikidata":"https://www.wikidata.org/wiki/Q168388","display_name":"Adaptive filter","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.26989999413490295},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.25679999589920044},{"id":"https://openalex.org/C202887219","wikidata":"https://www.wikidata.org/wiki/Q3895221","display_name":"Parametrization (atmospheric modeling)","level":3,"score":0.2515999972820282},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25110000371932983}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/taslp.2021.3104193","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3104193","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:kclpure.kcl.ac.uk:openaire/17e7016d-f44e-4141-85ba-ab6ab9667ee9","is_oa":true,"landing_page_url":"https://kclpure.kcl.ac.uk/portal/en/publications/17e7016d-f44e-4141-85ba-ab6ab9667ee9","pdf_url":null,"source":{"id":"https://openalex.org/S4306400216","display_name":"Research Portal (King's College London)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I183935753","host_organization_name":"King's College London","host_organization_lineage":["https://openalex.org/I183935753"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Oglic, D, Cvetkovic, Z & Sollich, P 2021, 'Learning Waveform-Based Acoustic Models using Deep Variational Convolutional Neural Networks', IEEE/ACM Transactions on Audio, Speech, and Language Processing , vol. 29, pp. 2850-2863. https://doi.org/10.1109/TASLP.2021.3104193","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:arXiv.org:1906.09526","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1906.09526","pdf_url":"https://arxiv.org/pdf/1906.09526","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:publications.goettingen-research-online.de:2/135671","is_oa":true,"landing_page_url":"https://resolver.sub.uni-goettingen.de/purl?gro-2/135671","pdf_url":null,"source":{"id":"https://openalex.org/S4306401634","display_name":"GoeScholar  The Publication Server of the Georg-August-Universit\u00e4t G\u00f6ttingen (Georg-August-Universit\u00e4t G\u00f6ttingen)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210122495","host_organization_name":"Asklepios Klinik St. Georg","host_organization_lineage":["https://openalex.org/I4210122495"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/article"}],"best_oa_location":{"id":"pmh:oai:kclpure.kcl.ac.uk:openaire/17e7016d-f44e-4141-85ba-ab6ab9667ee9","is_oa":true,"landing_page_url":"https://kclpure.kcl.ac.uk/portal/en/publications/17e7016d-f44e-4141-85ba-ab6ab9667ee9","pdf_url":null,"source":{"id":"https://openalex.org/S4306400216","display_name":"Research Portal (King's College London)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I183935753","host_organization_name":"King's College London","host_organization_lineage":["https://openalex.org/I183935753"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Oglic, D, Cvetkovic, Z & Sollich, P 2021, 'Learning Waveform-Based Acoustic Models using Deep Variational Convolutional Neural Networks', IEEE/ACM Transactions on Audio, Speech, and Language Processing , vol. 29, pp. 2850-2863. https://doi.org/10.1109/TASLP.2021.3104193","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4304376995","display_name":"SpeechWave","funder_award_id":"EP/R012067/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G6167047368","display_name":null,"funder_award_id":"EP/R012067/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":74,"referenced_works":["https://openalex.org/W1494192115","https://openalex.org/W1542280630","https://openalex.org/W1565746575","https://openalex.org/W1590183771","https://openalex.org/W1994906459","https://openalex.org/W1999974018","https://openalex.org/W2001414552","https://openalex.org/W2018586823","https://openalex.org/W2032558547","https://openalex.org/W2078279533","https://openalex.org/W2089624763","https://openalex.org/W2093231248","https://openalex.org/W2116217121","https://openalex.org/W2118020555","https://openalex.org/W2131548063","https://openalex.org/W2133815075","https://openalex.org/W2144908784","https://openalex.org/W2148154194","https://openalex.org/W2154833897","https://openalex.org/W2167270514","https://openalex.org/W2261689926","https://openalex.org/W2335317505","https://openalex.org/W2408093180","https://openalex.org/W2508048623","https://openalex.org/W2514741789","https://openalex.org/W2515753980","https://openalex.org/W2587210085","https://openalex.org/W2794209590","https://openalex.org/W2799958557","https://openalex.org/W2888909726","https://openalex.org/W2936481169","https://openalex.org/W2937814805","https://openalex.org/W2939173691","https://openalex.org/W2939776061","https://openalex.org/W2942544643","https://openalex.org/W2955174253","https://openalex.org/W2962901777","https://openalex.org/W2963071736","https://openalex.org/W2963669405","https://openalex.org/W2973053574","https://openalex.org/W3006835112","https://openalex.org/W3094855746","https://openalex.org/W4205130185","https://openalex.org/W4211049957","https://openalex.org/W4246858143","https://openalex.org/W4251742697","https://openalex.org/W4252684946","https://openalex.org/W4301866048","https://openalex.org/W6605799290","https://openalex.org/W6631190155","https://openalex.org/W6631362777","https://openalex.org/W6633882626","https://openalex.org/W6637600633","https://openalex.org/W6638836233","https://openalex.org/W6676315081","https://openalex.org/W6678409544","https://openalex.org/W6681599957","https://openalex.org/W6684488266","https://openalex.org/W6712560600","https://openalex.org/W6712930963","https://openalex.org/W6732256780","https://openalex.org/W6732814185","https://openalex.org/W6740512805","https://openalex.org/W6750407360","https://openalex.org/W6751030608","https://openalex.org/W6754929158","https://openalex.org/W6756426640","https://openalex.org/W6757084628","https://openalex.org/W6757107679","https://openalex.org/W6757424787","https://openalex.org/W6763271563","https://openalex.org/W6769645241","https://openalex.org/W6780226713","https://openalex.org/W6784486210"],"related_works":[],"abstract_inverted_index":{"We":[0,73,130],"investigate":[1],"the":[2,55,70,120,127,137,143,158,174,185,201],"potential":[3],"of":[4,30,61,64,100,136,152,184,213],"stochastic":[5,146],"neural":[6,52,78,139,209],"networks":[7],"for":[8,163,165,211],"learning":[9,71,212],"effective":[10,170],"waveform-based":[11,15,190],"acoustic":[12,215],"models.":[13],"The":[14,104],"setting,":[16],"inherent":[17],"to":[18,112,198],"fully":[19],"end-to-end":[20],"speech":[21,34,83],"recognition":[22,35],"systems,":[23],"is":[24],"motivated":[25],"by":[26,97,126],"several":[27],"comparative":[28],"studies":[29],"automatic":[31],"and":[32,141,192],"human":[33],"that":[36,80,194],"associate":[37],"standard":[38,108,218],"non-adaptive":[39],"feature":[40],"extraction":[41],"techniques":[42],"with":[43,217],"information":[44],"loss,":[45],"which":[46,166],"can":[47],"adversely":[48],"affect":[49],"robustness.":[50,199],"Stochastic":[51],"networks,":[53],"on":[54,132,173],"other":[56],"hand,":[57],"are":[58,95],"a":[59,75,133,181,204],"class":[60],"models":[62,216],"capable":[63],"incorporating":[65],"rich":[66],"regularization":[67],"mechanisms":[68],"into":[69,84],"process.":[72],"consider":[74],"deep":[76,207],"convolutional":[77,91,208],"network":[79,105,210],"first":[81],"decomposes":[82],"frequency":[85],"sub-bands":[86],"via":[87],"an":[88,153,169],"adaptive":[89],"parametric":[90,128],"block":[92],"where":[93],"filters":[94],"specified":[96],"cosine":[98],"modulations":[99],"compactly":[101],"supported":[102],"windows.":[103],"then":[106],"employs":[107],"non-parametric":[109],"1D":[110],"convolutions":[111],"extract":[113],"relevant":[114],"spectro-temporal":[115],"patterns":[116],"while":[117],"gradually":[118],"compressing":[119],"structured":[121],"high":[122],"dimensional":[123],"representation":[124],"generated":[125],"block.":[129],"rely":[131],"probabilistic":[134],"parametrization":[135],"proposed":[138,186,206],"architecture":[140],"learn":[142],"model":[144],"using":[145],"variational":[147],"inference.":[148],"This":[149],"requires":[150],"evaluation":[151],"analytically":[154],"intractable":[155],"integral":[156],"defining":[157],"Kullback-Leibler":[159],"divergence":[160],"term":[161],"responsible":[162],"regularization,":[164],"we":[167],"propose":[168],"approximation":[171],"based":[172],"Gauss-Hermite":[175],"quadrature.":[176],"Our":[177],"empirical":[178],"results":[179],"demonstrate":[180],"superior":[182],"performance":[183],"approach":[187,202],"over":[188],"comparable":[189],"baselines":[191],"indicate":[193],"it":[195],"could":[196],"lead":[197],"Moreover,":[200],"outperforms":[203],"recently":[205],"robust":[214],"FBANK":[219],"features.":[220]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2021-08-16T00:00:00"}
