{"id":"https://openalex.org/W2810313973","doi":"https://doi.org/10.1109/taslp.2018.2848698","title":"Leveraging Frequency-Dependent Kernel and DIP-Based Clustering for Robust Speech Activity Detection in Naturalistic Audio Streams","display_name":"Leveraging Frequency-Dependent Kernel and DIP-Based Clustering for Robust Speech Activity Detection in Naturalistic Audio Streams","publication_year":2018,"publication_date":"2018-07-02","ids":{"openalex":"https://openalex.org/W2810313973","doi":"https://doi.org/10.1109/taslp.2018.2848698","mag":"2810313973"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2018.2848698","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2018.2848698","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033701513","display_name":"Harishchandra Dubey","orcid":"https://orcid.org/0000-0003-0476-3884"},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Harishchandra Dubey","raw_affiliation_strings":["Robust Speech Technologies Lab, Center for Robust Speech Systems, The University of Texas at Dallas, Richardson, TX, USA"],"affiliations":[{"raw_affiliation_string":"Robust Speech Technologies Lab, Center for Robust Speech Systems, The University of Texas at Dallas, Richardson, TX, USA","institution_ids":["https://openalex.org/I162577319"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111469714","display_name":"Abhijeet Sangwan","orcid":null},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Abhijeet Sangwan","raw_affiliation_strings":["Robust Speech Technologies Lab, Center for Robust Speech Systems, The University of Texas at Dallas, Richardson, TX, USA"],"affiliations":[{"raw_affiliation_string":"Robust Speech Technologies Lab, Center for Robust Speech Systems, The University of Texas at Dallas, Richardson, TX, USA","institution_ids":["https://openalex.org/I162577319"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057910370","display_name":"John H. L. Hansen","orcid":"https://orcid.org/0000-0003-1382-9929"},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John H. L. Hansen","raw_affiliation_strings":["Robust Speech Technologies Lab, Center for Robust Speech Systems, The University of Texas at Dallas, Richardson, TX, USA"],"affiliations":[{"raw_affiliation_string":"Robust Speech Technologies Lab, Center for Robust Speech Systems, The University of Texas at Dallas, Richardson, TX, USA","institution_ids":["https://openalex.org/I162577319"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5033701513"],"corresponding_institution_ids":["https://openalex.org/I162577319"],"apc_list":null,"apc_paid":null,"fwci":1.6288,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.87595134,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"26","issue":"11","first_page":"2056","last_page":"2071"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7941446304321289},{"id":"https://openalex.org/keywords/nist","display_name":"NIST","score":0.6996481418609619},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6431370973587036},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.593624472618103},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.5635949373245239},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5384592413902283},{"id":"https://openalex.org/keywords/mel-frequency-cepstrum","display_name":"Mel-frequency cepstrum","score":0.4886663258075714},{"id":"https://openalex.org/keywords/spectral-clustering","display_name":"Spectral clustering","score":0.43621695041656494},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.416454553604126},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.37512195110321045},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.34242916107177734},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.2661128640174866}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7941446304321289},{"id":"https://openalex.org/C111219384","wikidata":"https://www.wikidata.org/wiki/Q6954384","display_name":"NIST","level":2,"score":0.6996481418609619},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6431370973587036},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.593624472618103},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.5635949373245239},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5384592413902283},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.4886663258075714},{"id":"https://openalex.org/C105611402","wikidata":"https://www.wikidata.org/wiki/Q2976589","display_name":"Spectral clustering","level":3,"score":0.43621695041656494},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.416454553604126},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37512195110321045},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34242916107177734},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2661128640174866},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2018.2848698","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2018.2848698","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7900000214576721}],"awards":[{"id":"https://openalex.org/G8087301405","display_name":null,"funder_award_id":"FA8750-15-1-0205","funder_id":"https://openalex.org/F4320338294","funder_display_name":"Air Force Research Laboratory"}],"funders":[{"id":"https://openalex.org/F4320338294","display_name":"Air Force Research Laboratory","ror":"https://ror.org/02e2egq70"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":72,"referenced_works":["https://openalex.org/W1536583098","https://openalex.org/W1974387177","https://openalex.org/W1985242443","https://openalex.org/W1994387177","https://openalex.org/W1997817740","https://openalex.org/W2007119422","https://openalex.org/W2009150118","https://openalex.org/W2023582935","https://openalex.org/W2041823554","https://openalex.org/W2056871913","https://openalex.org/W2058732030","https://openalex.org/W2062826588","https://openalex.org/W2095755195","https://openalex.org/W2098265087","https://openalex.org/W2102346872","https://openalex.org/W2109440058","https://openalex.org/W2115008841","https://openalex.org/W2115717467","https://openalex.org/W2120480179","https://openalex.org/W2125114513","https://openalex.org/W2126693545","https://openalex.org/W2129120544","https://openalex.org/W2137075158","https://openalex.org/W2141102245","https://openalex.org/W2157394313","https://openalex.org/W2158146178","https://openalex.org/W2158777221","https://openalex.org/W2161025448","https://openalex.org/W2171903461","https://openalex.org/W2240641835","https://openalex.org/W2242685705","https://openalex.org/W2295098554","https://openalex.org/W2353261545","https://openalex.org/W2398971481","https://openalex.org/W2401364490","https://openalex.org/W2403004229","https://openalex.org/W2403186097","https://openalex.org/W2489277755","https://openalex.org/W2509661423","https://openalex.org/W2514407679","https://openalex.org/W2515632544","https://openalex.org/W2515750888","https://openalex.org/W2525821395","https://openalex.org/W2541334028","https://openalex.org/W2556948255","https://openalex.org/W2564171085","https://openalex.org/W2586681212","https://openalex.org/W2606468144","https://openalex.org/W2738707637","https://openalex.org/W2786608204","https://openalex.org/W2787053546","https://openalex.org/W2800837202","https://openalex.org/W2801018181","https://openalex.org/W2887979278","https://openalex.org/W2895902173","https://openalex.org/W2942177450","https://openalex.org/W2963947576","https://openalex.org/W2976826970","https://openalex.org/W3098028374","https://openalex.org/W3127686677","https://openalex.org/W6675700563","https://openalex.org/W6697089127","https://openalex.org/W6712618806","https://openalex.org/W6712995851","https://openalex.org/W6713037448","https://openalex.org/W6731070234","https://openalex.org/W6748195155","https://openalex.org/W6750904958","https://openalex.org/W6753916174","https://openalex.org/W6755319750","https://openalex.org/W6973666849","https://openalex.org/W7034203491"],"related_works":["https://openalex.org/W2158491338","https://openalex.org/W2807901368","https://openalex.org/W2133733652","https://openalex.org/W2072658171","https://openalex.org/W2606392311","https://openalex.org/W2320042380","https://openalex.org/W2048014685","https://openalex.org/W3009295899","https://openalex.org/W2370972896","https://openalex.org/W1482912984"],"abstract_inverted_index":{"Speech":[0,168],"activity":[1],"detection":[2],"(SAD)":[3],"is":[4,49,116,122],"front-end":[5],"in":[6,131],"most":[7],"speech":[8,13,31],"systems,":[9],"e.g.,":[10],"speaker":[11,146],"verification,":[12],"recognition":[14],"etc.":[15],"Supervised":[16],"SAD":[17,46,48,67,136,143,160,185],"typically":[18],"leverages":[19],"machine":[20],"learning":[21],"models":[22],"trained":[23],"on":[24,144,187],"annotated":[25],"data.":[26,150],"For":[27],"applications":[28],"like":[29],"zero-resource":[30],"processing":[32],"and":[33,107,138,153,174,209],"NIST-OpenSAT-2017":[34,154],"public":[35],"safety":[36],"communications":[37],"task,":[38],"it":[39],"might":[40],"not":[41],"be":[42],"feasible":[43],"to":[44],"collect":[45],"annotations.":[47],"challenging":[50],"for":[51,128,158,166],"naturalistic":[52,177,188],"audio":[53,189],"streams":[54],"containing":[55],"multiple":[56,200],"noise-sources":[57],"simultaneously.":[58],"We":[59,94,124,162,191],"propose":[60],"a":[61,117],"novel":[62],"frequency-dependent":[63],"kernel":[64],"(FDK)":[65],"based":[66],"features.":[68,93,212],"FDK":[69,81],"provides":[70],"enhanced":[71],"spectral":[72],"decomposition":[73],"from":[74],"which":[75],"several":[76],"statistical":[77,82],"descriptors":[78,83],"are":[79,84,156],"derived.":[80],"combined":[85],"by":[86],"principal":[87],"component":[88],"analysis":[89],"into":[90],"one-dimensional":[91],"FDK-SAD":[92],"further":[95],"proposed":[96,197],"two":[97,132,164],"decision":[98],"backends:":[99],"First,":[100],"variable":[101],"model-size":[102],"Gaussian":[103,206],"mixture":[104,207],"model":[105],"(VMGMM);":[106],"second,":[108,139],"Hartigan":[109],"dip-based":[110],"robust":[111],"feature":[112],"clustering.":[113],"While":[114],"VMGMM":[115],"model-based":[118],"approach,":[119],"the":[120,140,196],"DipSAD":[121],"nonparametric.":[123],"used":[125,157],"both":[126],"backends":[127],"comparative":[129,193],"evaluations":[130,186],"phases:":[133],"first,":[134],"standalone":[135,159,184],"performance;":[137],"effect":[141],"of":[142,195],"text-dependent":[145],"verification":[147],"using":[148],"RedDots":[149],"The":[151,180],"NIST-OpenSAD-2015":[152],"corpora":[155,171,182],"evaluations.":[161],"establish":[163],"Center":[165],"Robust":[167],"Systems":[169],"(CRSS)":[170],"namely":[172],"CRSS-PLTL-II":[173],"CRSS":[175,181],"long-duration":[176],"noise":[178],"corpus.":[179],"facilitate":[183],"streams.":[190],"performed":[192],"studies":[194],"approaches":[198],"with":[199],"baselines":[201],"including":[202],"SohnSAD,":[203],"rSAD,":[204],"semisupervised":[205],"model,":[208],"Gammatone":[210],"spectrogram":[211]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":4},{"year":2019,"cited_by_count":3},{"year":2018,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
