{"id":"https://openalex.org/W4223655162","doi":"https://doi.org/10.21437/interspeech.2022-856","title":"Personal VAD 2.0: Optimizing Personal Voice Activity Detection for On-Device Speech Recognition","display_name":"Personal VAD 2.0: Optimizing Personal Voice Activity Detection for On-Device Speech Recognition","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4223655162","doi":"https://doi.org/10.21437/interspeech.2022-856"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-856","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-856","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058886181","display_name":"Shaojin Ding","orcid":"https://orcid.org/0000-0002-2108-3111"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shaojin Ding","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003481800","display_name":"Rajeev Rikhye","orcid":"https://orcid.org/0000-0003-2011-2897"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajeev Rikhye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026297587","display_name":"Qiao Liang","orcid":"https://orcid.org/0000-0003-4464-4644"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao Liang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101153566","display_name":"Yanzhang He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yanzhang He","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101887853","display_name":"Quan Wang","orcid":"https://orcid.org/0000-0003-1051-8287"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Quan Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000078382","display_name":"Arun Narayanan","orcid":"https://orcid.org/0009-0008-3325-8928"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arun Narayanan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042631153","display_name":"Tom O\u2019Malley","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tom O\u2019Malley","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5048236120","display_name":"Ian McGraw","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ian McGraw","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5058886181"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.9156,"has_fulltext":false,"cited_by_count":29,"citation_normalized_percentile":{"value":0.93232609,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"3744","last_page":"3748"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9871000051498413,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.7781505584716797},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7426742315292358},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6859685778617859},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.47124868631362915},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.29726529121398926}],"concepts":[{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.7781505584716797},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7426742315292358},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6859685778617859},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.47124868631362915},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29726529121398926}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2022-856","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-856","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5400000214576721,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1491159402","https://openalex.org/W4297807400","https://openalex.org/W4313854686","https://openalex.org/W2499802997","https://openalex.org/W3162054169","https://openalex.org/W1813780412","https://openalex.org/W289407349","https://openalex.org/W2029134149","https://openalex.org/W2368768466","https://openalex.org/W2757081366"],"abstract_inverted_index":{"Personalization":[0],"of":[1,19,48,54,68,133,174],"on-device":[2,57],"speech":[3,167],"recognition":[4,168],"(ASR)":[5],"has":[6],"seen":[7],"explosive":[8],"growth":[9],"in":[10,85,93,103],"recent":[11],"years,":[12],"largely":[13],"due":[14],"to":[15,77,116,147,149],"the":[16,45,66,88,109,126,171],"increasing":[17],"popularity":[18],"personal":[20],"assistant":[21],"features":[22],"on":[23,164],"mobile":[24],"devices":[25],"and":[26,96,107,121,154,159],"smart":[27],"home":[28],"speakers.":[29],"In":[30],"this":[31,80],"work,":[32],"we":[33,129],"present":[34],"Personal":[35,69],"VAD":[36],"2.0,":[37],"a":[38,49,55,104,118,131,143,165],"personalized":[39],"voice":[40,46],"activity":[41,47],"detector":[42],"that":[43],"detects":[44],"target":[50],"speaker,":[51],"as":[52],"part":[53],"streaming":[56,105],"ASR":[58],"system.":[59],"Although":[60],"previous":[61],"proof-of-concept":[62],"studies":[63],"have":[64],"validated":[65],"effectiveness":[67],"VAD,":[70],"there":[71],"are":[72],"still":[73],"several":[74],"critical":[75],"challenges":[76],"address":[78],"before":[79],"model":[81,110],"can":[82],"be":[83,91,113],"used":[84],"production:":[86],"first,":[87],"quality":[89],"must":[90],"satisfactory":[92],"both":[94],"enrollment":[95],"enrollment-less":[97,150],"scenarios;":[98],"second,":[99],"it":[100],"should":[101,112],"operate":[102],"fashion;":[106],"finally,":[108],"size":[111],"small":[114],"enough":[115],"fit":[117],"limited":[119],"latency":[120,158],"CPU/Memory":[122],"budget.":[123],"To":[124],"meet":[125],"multi-faceted":[127],"requirements,":[128],"propose":[130],"series":[132],"novel":[134],"designs:":[135],"1)":[136],"advanced":[137],"speaker":[138],"embedding":[139],"modulation":[140],"methods;":[141],"2)":[142],"new":[144],"training":[145],"paradigm":[146],"generalize":[148],"conditions;":[151],"3)":[152],"architecture":[153],"runtime":[155],"optimizations":[156],"for":[157],"resource":[160],"restrictions.":[161],"Extensive":[162],"experiments":[163],"realistic":[166],"system":[169],"demonstrated":[170],"state-of-the-art":[172],"performance":[173],"our":[175],"proposed":[176],"method.":[177]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":12},{"year":2023,"cited_by_count":5}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
