{"id":"https://openalex.org/W2892300106","doi":"https://doi.org/10.1109/icassp.2018.8461921","title":"Temporal Modeling Using Dilated Convolution and Gating for Voice-Activity-Detection","display_name":"Temporal Modeling Using Dilated Convolution and Gating for Voice-Activity-Detection","publication_year":2018,"publication_date":"2018-04-01","ids":{"openalex":"https://openalex.org/W2892300106","doi":"https://doi.org/10.1109/icassp.2018.8461921","mag":"2892300106"},"language":"en","primary_location":{"id":"doi:10.1109/icassp.2018.8461921","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2018.8461921","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001306222","display_name":"Shuo-Yiin Chang","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shuo-Yiin Chang","raw_affiliation_strings":["Google Inc., U.S.A"],"affiliations":[{"raw_affiliation_string":"Google Inc., U.S.A","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046508099","display_name":"Bo Li","orcid":"https://orcid.org/0000-0002-1415-4444"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bo Li","raw_affiliation_strings":["Google Inc., U.S.A"],"affiliations":[{"raw_affiliation_string":"Google Inc., U.S.A","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050938071","display_name":"Gabor Simko","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gabor Simko","raw_affiliation_strings":["Google Inc., U.S.A"],"affiliations":[{"raw_affiliation_string":"Google Inc., U.S.A","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070513394","display_name":"Tara N. Sainath","orcid":"https://orcid.org/0000-0002-4126-6556"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tara N. Sainath","raw_affiliation_strings":["Google Inc., U.S.A"],"affiliations":[{"raw_affiliation_string":"Google Inc., U.S.A","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101403998","display_name":"Anshuman Tripathi","orcid":"https://orcid.org/0000-0002-4902-3719"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anshuman Tripathi","raw_affiliation_strings":["Google Inc., U.S.A"],"affiliations":[{"raw_affiliation_string":"Google Inc., U.S.A","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039776447","display_name":"A\u00e4ron van den Oord","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113297","display_name":"Google (United Kingdom)","ror":"https://ror.org/024bc3e07","country_code":"GB","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210113297","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Aaron van den Oord","raw_affiliation_strings":["Google DeepMind, London, U.K"],"affiliations":[{"raw_affiliation_string":"Google DeepMind, London, U.K","institution_ids":["https://openalex.org/I4210113297","https://openalex.org/I4210090411"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003562101","display_name":"Oriol Vinyals","orcid":"https://orcid.org/0000-0001-7848-7283"},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210113297","display_name":"Google (United Kingdom)","ror":"https://ror.org/024bc3e07","country_code":"GB","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210113297","https://openalex.org/I4210128969"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Oriol Vinyals","raw_affiliation_strings":["Google DeepMind, London, U.K"],"affiliations":[{"raw_affiliation_string":"Google DeepMind, London, U.K","institution_ids":["https://openalex.org/I4210113297","https://openalex.org/I4210090411"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5001306222"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":7.8173,"has_fulltext":false,"cited_by_count":80,"citation_normalized_percentile":{"value":0.98092572,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"5549","last_page":"5553"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8359121084213257},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7760531306266785},{"id":"https://openalex.org/keywords/jitter","display_name":"Jitter","score":0.5620412826538086},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.525409460067749},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.525036096572876},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.4979407787322998},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.450811505317688},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4281983971595764},{"id":"https://openalex.org/keywords/subnet","display_name":"Subnet","score":0.4192715287208557},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39380156993865967},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3577566146850586},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.33898645639419556},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.33561980724334717}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8359121084213257},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7760531306266785},{"id":"https://openalex.org/C134652429","wikidata":"https://www.wikidata.org/wiki/Q1052698","display_name":"Jitter","level":2,"score":0.5620412826538086},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.525409460067749},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.525036096572876},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.4979407787322998},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.450811505317688},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4281983971595764},{"id":"https://openalex.org/C21099817","wikidata":"https://www.wikidata.org/wiki/Q7631721","display_name":"Subnet","level":2,"score":0.4192715287208557},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39380156993865967},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3577566146850586},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.33898645639419556},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.33561980724334717},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C68115822","wikidata":"https://www.wikidata.org/wiki/Q1068172","display_name":"Sound pressure","level":2,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp.2018.8461921","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2018.8461921","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1533861849","https://openalex.org/W1536583098","https://openalex.org/W1594494252","https://openalex.org/W1686810756","https://openalex.org/W1689711448","https://openalex.org/W1999454387","https://openalex.org/W2048060899","https://openalex.org/W2053168335","https://openalex.org/W2064675550","https://openalex.org/W2108289863","https://openalex.org/W2117671523","https://openalex.org/W2168231600","https://openalex.org/W2194775991","https://openalex.org/W2295098554","https://openalex.org/W2402146185","https://openalex.org/W2412782625","https://openalex.org/W2513345070","https://openalex.org/W2519091744","https://openalex.org/W2617258110","https://openalex.org/W2625979394","https://openalex.org/W2734724284","https://openalex.org/W2742061524","https://openalex.org/W2962719052","https://openalex.org/W2963840672","https://openalex.org/W3032942295","https://openalex.org/W6631943919","https://openalex.org/W6632152668","https://openalex.org/W6650031217","https://openalex.org/W6655731121","https://openalex.org/W6684859321","https://openalex.org/W6697089127"],"related_works":["https://openalex.org/W2122030153","https://openalex.org/W2162449135","https://openalex.org/W642007152","https://openalex.org/W2401827384","https://openalex.org/W2355290951","https://openalex.org/W2052688117","https://openalex.org/W2552102772","https://openalex.org/W2103239478","https://openalex.org/W4294771049","https://openalex.org/W1523214805"],"abstract_inverted_index":{"Voice":[0,149],"activity":[1],"detection":[2],"(VAD)":[3],"is":[4,20,47,81],"the":[5,32,38,79,91,154,179,183],"task":[6,151],"of":[7,11,54,165],"predicting":[8],"which":[9,27],"parts":[10],"an":[12,21,103],"utterance":[13,80],"contains":[14],"speech":[15],"versus":[16],"background":[17],"noise.":[18],"It":[19],"important":[22],"first":[23],"step":[24],"to":[25,29,31,36,94],"determine":[26],"samples":[28],"send":[30],"decoder":[33],"and":[34,57,88,142],"when":[35,78],"close":[37],"microphone.":[39],"The":[40,125],"long":[41,82],"short-term":[42],"memory":[43],"neural":[44,122],"network":[45,123],"(LSTM)":[46],"a":[48,118,147,163],"popular":[49],"architecture":[50,105,127,156,185],"for":[51,84,170],"sequential":[52],"modeling":[53,114],"acoustic":[55],"signals,":[56],"has":[58,68],"been":[59,69],"successfully":[60],"used":[61],"in":[62,132],"several":[63],"VAD":[64,171],"applications.":[65],"However,":[66],"it":[67,135],"observed":[70],"that":[71,106,153,181],"LSTMs":[72,169],"suffer":[73,109],"from":[74,110,129,186],"state":[75,93],"saturation":[76,111],"problems":[77,112],"(i.e.,":[83],"voice":[85],"dictation":[86],"tasks),":[87],"thus":[89],"requires":[90],"LSTM":[92],"be":[95],"periodically":[96],"reset.":[97],"In":[98],"this":[99],"paper,":[100],"we":[101],"propose":[102],"alternative":[104],"does":[107],"not":[108],"by":[113],"temporal":[115],"variations":[116],"through":[117],"stateless":[119],"dilated":[120,137],"convolution":[121],"(CNN).":[124],"proposed":[126,155,184],"differs":[128],"conventional":[130,187],"CNNs":[131],"three":[133],"respects:":[134],"uses":[136],"causal":[138],"convolution,":[139],"gated":[140],"activations":[141],"residual":[143],"connections.":[144],"Results":[145],"on":[146],"Google":[148],"Typing":[150],"shows":[152],"achieves":[157],"14%":[158],"relative":[159],"FA":[160],"improvement":[161],"at":[162],"FR":[164],"1%":[166],"over":[167],"state-of-the-art":[168],"task.":[172],"We":[173],"also":[174],"include":[175],"detailed":[176],"experiments":[177],"investigating":[178],"factors":[180],"distinguish":[182],"convolution.":[188]},"counts_by_year":[{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":9},{"year":2022,"cited_by_count":11},{"year":2021,"cited_by_count":16},{"year":2020,"cited_by_count":19},{"year":2019,"cited_by_count":12}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
