{"id":"https://openalex.org/W4392908942","doi":"https://doi.org/10.1109/icassp48485.2024.10447620","title":"Positive Transfer of the Whisper Speech Transformer to Human and Animal Voice Activity Detection","display_name":"Positive Transfer of the Whisper Speech Transformer to Human and Animal Voice Activity Detection","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392908942","doi":"https://doi.org/10.1109/icassp48485.2024.10447620"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447620","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447620","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012125124","display_name":"Nianlong Gu","orcid":"https://orcid.org/0000-0002-8474-0836"},"institutions":[{"id":"https://openalex.org/I4210118033","display_name":"Infrastructure Management Consultants (Switzerland)","ror":"https://ror.org/027r3eh78","country_code":"CH","type":"company","lineage":["https://openalex.org/I4210118033"]},{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Nianlong Gu","raw_affiliation_strings":["University of Zurich,Linguistic Research Infrastructure,Switzerland","Linguistic Research Infrastructure, University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich,Linguistic Research Infrastructure,Switzerland","institution_ids":["https://openalex.org/I4210118033"]},{"raw_affiliation_string":"Linguistic Research Infrastructure, University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113145351","display_name":"Kanghwi Lee","orcid":null},"institutions":[{"id":"https://openalex.org/I12708293","display_name":"SIB Swiss Institute of Bioinformatics","ror":"https://ror.org/002n09z45","country_code":"CH","type":"funder","lineage":["https://openalex.org/I12708293"]},{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]},{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Kanghwi Lee","raw_affiliation_strings":["University of Zurich and ETH Zurich,Institute of Neuroinformatics,Switzerland","Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich and ETH Zurich,Institute of Neuroinformatics,Switzerland","institution_ids":["https://openalex.org/I12708293","https://openalex.org/I202697423","https://openalex.org/I35440088"]},{"raw_affiliation_string":"Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland","institution_ids":["https://openalex.org/I12708293","https://openalex.org/I202697423","https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101821552","display_name":"Maris Basha","orcid":"https://orcid.org/0000-0002-6855-4590"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]},{"id":"https://openalex.org/I12708293","display_name":"SIB Swiss Institute of Bioinformatics","ror":"https://ror.org/002n09z45","country_code":"CH","type":"funder","lineage":["https://openalex.org/I12708293"]},{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Maris Basha","raw_affiliation_strings":["University of Zurich and ETH Zurich,Institute of Neuroinformatics,Switzerland","Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich and ETH Zurich,Institute of Neuroinformatics,Switzerland","institution_ids":["https://openalex.org/I12708293","https://openalex.org/I202697423","https://openalex.org/I35440088"]},{"raw_affiliation_string":"Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland","institution_ids":["https://openalex.org/I12708293","https://openalex.org/I202697423","https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083202152","display_name":"Sumit Kumar Ram","orcid":"https://orcid.org/0000-0002-5431-5919"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]},{"id":"https://openalex.org/I4210118033","display_name":"Infrastructure Management Consultants (Switzerland)","ror":"https://ror.org/027r3eh78","country_code":"CH","type":"company","lineage":["https://openalex.org/I4210118033"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Sumit Kumar Ram","raw_affiliation_strings":["University of Zurich,Linguistic Research Infrastructure,Switzerland","Linguistic Research Infrastructure, University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich,Linguistic Research Infrastructure,Switzerland","institution_ids":["https://openalex.org/I4210118033"]},{"raw_affiliation_string":"Linguistic Research Infrastructure, University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032410069","display_name":"Guanghao You","orcid":"https://orcid.org/0000-0003-2434-6076"},"institutions":[{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]},{"id":"https://openalex.org/I4210118033","display_name":"Infrastructure Management Consultants (Switzerland)","ror":"https://ror.org/027r3eh78","country_code":"CH","type":"company","lineage":["https://openalex.org/I4210118033"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Guanghao You","raw_affiliation_strings":["University of Zurich,Linguistic Research Infrastructure,Switzerland","Linguistic Research Infrastructure, University of Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich,Linguistic Research Infrastructure,Switzerland","institution_ids":["https://openalex.org/I4210118033"]},{"raw_affiliation_string":"Linguistic Research Infrastructure, University of Zurich, Switzerland","institution_ids":["https://openalex.org/I202697423"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5031455263","display_name":"Richard H. R. Hahnloser","orcid":"https://orcid.org/0000-0002-4039-7773"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]},{"id":"https://openalex.org/I12708293","display_name":"SIB Swiss Institute of Bioinformatics","ror":"https://ror.org/002n09z45","country_code":"CH","type":"funder","lineage":["https://openalex.org/I12708293"]},{"id":"https://openalex.org/I202697423","display_name":"University of Zurich","ror":"https://ror.org/02crff812","country_code":"CH","type":"education","lineage":["https://openalex.org/I202697423"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Richard H. R. Hahnloser","raw_affiliation_strings":["University of Zurich and ETH Zurich,Institute of Neuroinformatics,Switzerland","Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"University of Zurich and ETH Zurich,Institute of Neuroinformatics,Switzerland","institution_ids":["https://openalex.org/I12708293","https://openalex.org/I202697423","https://openalex.org/I35440088"]},{"raw_affiliation_string":"Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland","institution_ids":["https://openalex.org/I12708293","https://openalex.org/I202697423","https://openalex.org/I35440088"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5012125124"],"corresponding_institution_ids":["https://openalex.org/I202697423","https://openalex.org/I4210118033"],"apc_list":null,"apc_paid":null,"fwci":6.1059,"has_fulltext":false,"cited_by_count":17,"citation_normalized_percentile":{"value":0.96714717,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"7505","last_page":"7509"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8480966687202454},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.7439050674438477},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7244188785552979},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7122806310653687},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.619534432888031},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.6133464574813843},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.40666013956069946},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3281603455543518},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09670442342758179}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8480966687202454},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.7439050674438477},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7244188785552979},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7122806310653687},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.619534432888031},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.6133464574813843},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.40666013956069946},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3281603455543518},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09670442342758179},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447620","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447620","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/5","display_name":"Gender equality","score":0.4000000059604645}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W2134908101","https://openalex.org/W2792764867","https://openalex.org/W2803224102","https://openalex.org/W2885307078","https://openalex.org/W2999938474","https://openalex.org/W3016361809","https://openalex.org/W3024599079","https://openalex.org/W3025260599","https://openalex.org/W3161282967","https://openalex.org/W3210468882","https://openalex.org/W4205665640","https://openalex.org/W4211118298","https://openalex.org/W4220801857","https://openalex.org/W4311000453","https://openalex.org/W4313679745","https://openalex.org/W4385289289","https://openalex.org/W4386415600","https://openalex.org/W4394403239","https://openalex.org/W6682542541","https://openalex.org/W6749825310","https://openalex.org/W6847363464","https://openalex.org/W6848190777","https://openalex.org/W6855777900","https://openalex.org/W6863363983","https://openalex.org/W6909043767"],"related_works":["https://openalex.org/W4214896311","https://openalex.org/W1840351222","https://openalex.org/W642007152","https://openalex.org/W2401827384","https://openalex.org/W2355290951","https://openalex.org/W4304187160","https://openalex.org/W2052688117","https://openalex.org/W2552102772","https://openalex.org/W4294771049","https://openalex.org/W1523214805"],"abstract_inverted_index":{"This":[0],"paper":[1],"introduces":[2],"WhisperSeg,":[3],"utilizing":[4],"the":[5,99],"Whisper":[6],"Transformer":[7],"pre-trained":[8],"for":[9,14],"Automatic":[10],"Speech":[11],"Recognition":[12],"(ASR)":[13],"human":[15,28],"and":[16,38,51,59],"animal":[17,31,92],"Voice":[18],"Activity":[19],"Detection":[20],"(VAD).":[21],"Contrary":[22],"to":[23,90],"traditional":[24],"methods":[25],"that":[26],"detect":[27],"voice":[29,62],"or":[30],"vocalizations":[32],"from":[33,77],"a":[34,65,70,84],"short":[35],"audio":[36,50,67],"frame":[37],"rely":[39],"on":[40],"careful":[41],"threshold":[42],"selection,":[43],"WhisperSeg":[44],"processes":[45],"entire":[46],"spectrograms":[47],"of":[48,56,61,87],"long":[49],"generates":[52],"plain":[53],"text":[54],"representations":[55],"onset,":[57],"offset,":[58],"type":[60],"activity.":[63],"Processing":[64],"longer":[66],"context":[68],"with":[69],"larger":[71],"network":[72],"greatly":[73],"improves":[74],"detection":[75,88],"accuracy":[76],"few":[78],"labeled":[79],"examples.":[80],"We":[81],"further":[82],"demonstrate":[83],"positive":[85],"transfer":[86],"performance":[89],"new":[91],"species,":[93],"making":[94],"our":[95],"approach":[96],"viable":[97],"in":[98],"data-scarce":[100],"multi-species":[101],"setting.":[102],"<sup":[103],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[104],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[105]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":2}],"updated_date":"2026-03-15T09:29:46.208133","created_date":"2025-10-10T00:00:00"}
