{"id":"https://openalex.org/W2406262283","doi":"https://doi.org/10.21437/interspeech.2013-203","title":"Speech activity detection on youtube using deep neural networks","display_name":"Speech activity detection on youtube using deep neural networks","publication_year":2013,"publication_date":"2013-08-25","ids":{"openalex":"https://openalex.org/W2406262283","doi":"https://doi.org/10.21437/interspeech.2013-203","mag":"2406262283"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2013-203","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2013-203","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2013","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088320958","display_name":"Neville Ryant","orcid":"https://orcid.org/0000-0003-2545-6912"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Neville Ryant","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055868875","display_name":"Mark Liberman","orcid":"https://orcid.org/0000-0002-8605-9024"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mark Liberman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5043406907","display_name":"Jiahong Yuan","orcid":"https://orcid.org/0009-0008-2162-7167"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiahong Yuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":7.4078,"has_fulltext":false,"cited_by_count":100,"citation_normalized_percentile":{"value":0.97759815,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"728","last_page":"731"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11233","display_name":"Advanced Adaptive Filtering Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7602406740188599},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5752277970314026},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5527676939964294},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.47069454193115234},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4118293523788452},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.2674033045768738}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7602406740188599},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5752277970314026},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5527676939964294},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.47069454193115234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4118293523788452},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2674033045768738}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2013-203","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2013-203","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2013","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W100598793","https://openalex.org/W1665214252","https://openalex.org/W1946893126","https://openalex.org/W2023582935","https://openalex.org/W2070176749","https://openalex.org/W2097553100","https://openalex.org/W2098265087","https://openalex.org/W2130426352","https://openalex.org/W2136922672","https://openalex.org/W2138029428","https://openalex.org/W2147768505","https://openalex.org/W2172070182","https://openalex.org/W2252254117","https://openalex.org/W2402212287","https://openalex.org/W2403186097","https://openalex.org/W2805889152","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W3135230428","https://openalex.org/W2904739811","https://openalex.org/W2099105119","https://openalex.org/W2559837139","https://openalex.org/W2152158029","https://openalex.org/W2012540220","https://openalex.org/W2081671587","https://openalex.org/W1151175420","https://openalex.org/W2131711534","https://openalex.org/W2541680182"],"abstract_inverted_index":{"Speech":[0],"activity":[1],"detection":[2],"(SAD)":[3],"is":[4,51,85],"an":[5,96],"important":[6],"first":[7],"step":[8],"in":[9,75],"speech":[10],"processing.":[11],"Commonly":[12],"used":[13],"methods":[14],"(e.g.,":[15,62],"frame-level":[16],"classification":[17],"using":[18,95],"gaussian":[19],"mixture":[20],"models":[21],"(GMMs))":[22],"work":[23],"well":[24,33],"under":[25],"stationary":[26],"noise":[27],"conditions,":[28],"but":[29],"do":[30],"not":[31],"generalize":[32],"to":[34,52,73,86,136],"domains":[35],"such":[36,99],"as":[37,100],"YouTube,":[38],"where":[39],"videos":[40,134],"may":[41],"exhibit":[42],"a":[43,111,137],"diverse":[44],"range":[45],"of":[46,93,116,119],"environmental":[47],"conditions.":[48],"One":[49],"solution":[50],"augment":[53],"the":[54,91],"conventional":[55,138],"cepstral":[56,122],"features":[57,61,89],"with":[58,113],"additional,":[59],"hand-engineered":[60],"spectral":[63,65,68],"flux,":[64],"centroid,":[66],"multiband":[67],"entropies)":[69],"which":[70],"are":[71],"robust":[72,88],"changes":[74],"environment":[76],"and":[77],"recording":[78],"condition.":[79],"An":[80],"alternative":[81],"approach,":[82],"explored":[83],"here,":[84],"learn":[87],"during":[90],"course":[92],"training":[94],"appropriate":[97],"architecture":[98],"deep":[101],"neural":[102],"networks":[103],"(DNNs).":[104],"In":[105],"this":[106],"paper":[107],"we":[108],"demonstrate":[109],"that":[110],"DNN":[112],"input":[114],"consisting":[115],"multiple":[117],"frames":[118],"mel":[120],"frequency":[121],"coefficients":[123],"(MFCCs)":[124],"yields":[125],"drastically":[126],"lower":[127],"frame-wise":[128],"error":[129],"rates":[130],"(19.6%)":[131],"on":[132],"YouTube":[133],"compared":[135],"GMM":[139],"based":[140],"system":[141],"(40%).":[142]},"counts_by_year":[{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":10},{"year":2020,"cited_by_count":12},{"year":2019,"cited_by_count":9},{"year":2018,"cited_by_count":20},{"year":2017,"cited_by_count":11},{"year":2016,"cited_by_count":11},{"year":2015,"cited_by_count":6},{"year":2014,"cited_by_count":5},{"year":2013,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
