{"id":"https://openalex.org/W4408529258","doi":"https://doi.org/10.3390/info16030233","title":"VAD-CLVA: Integrating CLIP with LLaVA for Voice Activity Detection","display_name":"VAD-CLVA: Integrating CLIP with LLaVA for Voice Activity Detection","publication_year":2025,"publication_date":"2025-03-16","ids":{"openalex":"https://openalex.org/W4408529258","doi":"https://doi.org/10.3390/info16030233"},"language":"en","primary_location":{"id":"doi:10.3390/info16030233","is_oa":true,"landing_page_url":"https://doi.org/10.3390/info16030233","pdf_url":"https://www.mdpi.com/2078-2489/16/3/233/pdf?version=1742111411","source":{"id":"https://openalex.org/S4210219776","display_name":"Information","issn_l":"2078-2489","issn":["2078-2489"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2078-2489/16/3/233/pdf?version=1742111411","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114494466","display_name":"Andrea Appiani","orcid":"https://orcid.org/0009-0007-6359-9611"},"institutions":[{"id":"https://openalex.org/I11039511","display_name":"University of Bergamo","ror":"https://ror.org/02mbd5571","country_code":"IT","type":"education","lineage":["https://openalex.org/I11039511"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Andrea Appiani","raw_affiliation_strings":["Department of Management, Information and Production Engineering, University of Bergamo, 24127 Dalmine, Italy"],"affiliations":[{"raw_affiliation_string":"Department of Management, Information and Production Engineering, University of Bergamo, 24127 Dalmine, Italy","institution_ids":["https://openalex.org/I11039511"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057859690","display_name":"Cigdem Beyan","orcid":"https://orcid.org/0000-0002-9583-0087"},"institutions":[{"id":"https://openalex.org/I119439378","display_name":"University of Verona","ror":"https://ror.org/039bp8j42","country_code":"IT","type":"education","lineage":["https://openalex.org/I119439378"]}],"countries":["IT"],"is_corresponding":true,"raw_author_name":"Cigdem Beyan","raw_affiliation_strings":["Department of Computer Science, University of Verona, 37134 Verona, Italy"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Verona, 37134 Verona, Italy","institution_ids":["https://openalex.org/I119439378"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5057859690"],"corresponding_institution_ids":["https://openalex.org/I119439378"],"apc_list":{"value":1400,"currency":"CHF","value_usd":1515},"apc_paid":{"value":1400,"currency":"CHF","value_usd":1515},"fwci":1.544,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.78630186,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"16","issue":"3","first_page":"233","last_page":"233"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.5166038274765015},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4819871783256531},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4696452021598816},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.37814217805862427},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.24606600403785706},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.06116524338722229}],"concepts":[{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.5166038274765015},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4819871783256531},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4696452021598816},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.37814217805862427},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.24606600403785706},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.06116524338722229}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3390/info16030233","is_oa":true,"landing_page_url":"https://doi.org/10.3390/info16030233","pdf_url":"https://www.mdpi.com/2078-2489/16/3/233/pdf?version=1742111411","source":{"id":"https://openalex.org/S4210219776","display_name":"Information","issn_l":"2078-2489","issn":["2078-2489"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:fc61ef7ffb924312a0b771267d1f5275","is_oa":true,"landing_page_url":"https://doaj.org/article/fc61ef7ffb924312a0b771267d1f5275","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Information, Vol 16, Iss 3, p 233 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.3390/info16030233","is_oa":true,"landing_page_url":"https://doi.org/10.3390/info16030233","pdf_url":"https://www.mdpi.com/2078-2489/16/3/233/pdf?version=1742111411","source":{"id":"https://openalex.org/S4210219776","display_name":"Information","issn_l":"2078-2489","issn":["2078-2489"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4408529258.pdf"},"referenced_works_count":59,"referenced_works":["https://openalex.org/W1013543043","https://openalex.org/W2002979515","https://openalex.org/W2064194796","https://openalex.org/W2065980576","https://openalex.org/W2084237818","https://openalex.org/W2090088747","https://openalex.org/W2104380558","https://openalex.org/W2109788549","https://openalex.org/W2222534143","https://openalex.org/W2316138215","https://openalex.org/W2323437298","https://openalex.org/W2330149154","https://openalex.org/W2610846088","https://openalex.org/W2749694333","https://openalex.org/W2759799350","https://openalex.org/W2768817490","https://openalex.org/W2793493229","https://openalex.org/W2913526742","https://openalex.org/W2963470929","https://openalex.org/W2965002127","https://openalex.org/W2971784756","https://openalex.org/W2991853946","https://openalex.org/W3010594275","https://openalex.org/W3016098309","https://openalex.org/W3035875334","https://openalex.org/W3038871978","https://openalex.org/W3041847644","https://openalex.org/W3104119587","https://openalex.org/W3112188842","https://openalex.org/W3116298410","https://openalex.org/W3119269912","https://openalex.org/W3135367836","https://openalex.org/W3136499730","https://openalex.org/W3172472082","https://openalex.org/W3189964604","https://openalex.org/W3198668286","https://openalex.org/W3198675127","https://openalex.org/W3201927751","https://openalex.org/W3206008172","https://openalex.org/W3207207922","https://openalex.org/W3216551675","https://openalex.org/W4214601882","https://openalex.org/W4283811195","https://openalex.org/W4286378963","https://openalex.org/W4291920479","https://openalex.org/W4308824155","https://openalex.org/W4312740349","https://openalex.org/W4365807578","https://openalex.org/W4386597066","https://openalex.org/W4388999300","https://openalex.org/W4390190404","https://openalex.org/W4390873118","https://openalex.org/W4390874206","https://openalex.org/W4390889814","https://openalex.org/W4392402421","https://openalex.org/W4394604181","https://openalex.org/W4402727764","https://openalex.org/W6735927292","https://openalex.org/W6804336870"],"related_works":["https://openalex.org/W191108438","https://openalex.org/W3135230428","https://openalex.org/W2904739811","https://openalex.org/W249088392","https://openalex.org/W2152158029","https://openalex.org/W2012540220","https://openalex.org/W2131711534","https://openalex.org/W2559837139","https://openalex.org/W1151175420","https://openalex.org/W2407342067"],"abstract_inverted_index":{"Voice":[0],"activity":[1],"detection":[2],"(VAD)":[3],"is":[4,13],"the":[5,17,81,88,102,132],"process":[6],"of":[7,19,84,135],"automatically":[8],"determining":[9],"whether":[10],"a":[11,63,96,117],"person":[12],"speaking":[14],"and":[15,105,154],"identifying":[16],"timing":[18],"their":[20],"speech":[21],"in":[22,58],"an":[23,85],"audiovisual":[24],"data.":[25],"Traditionally,":[26],"this":[27],"task":[28],"has":[29],"been":[30],"tackled":[31],"by":[32,41,95],"processing":[33],"either":[34],"audio":[35],"signals":[36],"or":[37,40,47],"visual":[38,74,141],"data,":[39],"combining":[42],"both":[43],"modalities":[44],"through":[45,116],"fusion":[46],"joint":[48],"learning.":[49],"In":[50],"our":[51,136,145],"study,":[52],"drawing":[53],"inspiration":[54],"from":[55,111],"recent":[56],"advancements":[57],"visual-language":[59],"models,":[60],"we":[61],"introduce":[62],"novel":[64],"approach":[65,146],"leveraging":[66],"Contrastive":[67],"Language-Image":[68],"Pretraining":[69],"(CLIP)":[70],"models.":[71],"The":[72],"CLIP":[73],"encoder":[75,90],"analyzes":[76],"video":[77],"segments":[78],"focusing":[79],"on":[80,158],"upper":[82],"body":[83],"individual,":[86],"while":[87],"text":[89],"processes":[91],"textual":[92],"descriptions":[93],"generated":[94],"Generative":[97],"Large":[98,103],"Multimodal":[99],"Model,":[100],"i.e.,":[101],"Language":[104],"Vision":[106],"Assistant":[107],"(LLaVA).":[108],"Subsequently,":[109],"embeddings":[110],"these":[112],"encoders":[113],"are":[114],"fused":[115],"deep":[118],"neural":[119],"network":[120],"to":[121,139],"perform":[122],"VAD.":[123],"Our":[124],"experimental":[125],"analysis":[126],"across":[127],"three":[128],"VAD":[129,142],"benchmarks":[130],"showcases":[131],"superior":[133],"performance":[134],"method":[137],"compared":[138],"existing":[140],"approaches.":[143],"Notably,":[144],"outperforms":[147],"several":[148],"audio-visual":[149,160],"methods":[150],"despite":[151],"its":[152],"simplicity":[153],"without":[155],"requiring":[156],"pretraining":[157],"extensive":[159],"datasets.":[161]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-02-27T16:54:17.756197","created_date":"2025-10-10T00:00:00"}
