{"id":"https://openalex.org/W7134854311","doi":"https://doi.org/10.48550/arxiv.2603.07708","title":"VoiceSHIELD-Small: Real-Time Malicious Speech Detection and Transcription","display_name":"VoiceSHIELD-Small: Real-Time Malicious Speech Detection and Transcription","publication_year":2026,"publication_date":"2026-03-08","ids":{"openalex":"https://openalex.org/W7134854311","doi":"https://doi.org/10.48550/arxiv.2603.07708"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.07708","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128650054","display_name":"Sumit Ranjan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ranjan, Sumit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128648463","display_name":"Sugandha Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharma, Sugandha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120769238","display_name":"Ubaid Abbas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abbas, Ubaid","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128629959","display_name":"Puneeth N Ail","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ail, Puneeth N","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.7745000123977661,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.7745000123977661,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.03629999980330467,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.019099999219179153,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.652899980545044},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.6104999780654907},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4512999951839447},{"id":"https://openalex.org/keywords/license","display_name":"License","score":0.39910000562667847},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.38260000944137573},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.357699990272522}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7283999919891357},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6937000155448914},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.652899980545044},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.6104999780654907},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4512999951839447},{"id":"https://openalex.org/C2780560020","wikidata":"https://www.wikidata.org/wiki/Q79719","display_name":"License","level":2,"score":0.39910000562667847},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.38260000944137573},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.357699990272522},{"id":"https://openalex.org/C182964821","wikidata":"https://www.wikidata.org/wiki/Q7939498","display_name":"Voice analysis","level":2,"score":0.3522999882698059},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3107999861240387},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.301800012588501},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2581000030040741}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.07708","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.07708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.07708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.07708","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.7169317007064819,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Voice":[0],"interfaces":[1],"are":[2],"quickly":[3],"becoming":[4],"a":[5,58,89,93,117],"common":[6],"way":[7],"for":[8],"people":[9],"to":[10,38,102,178],"interact":[11],"with":[12],"AI":[13,186],"systems.":[14],"This":[15,54],"also":[16,158],"brings":[17],"new":[18],"security":[19,32],"risks,":[20],"such":[21],"as":[22],"prompt":[23],"injection,":[24],"social":[25],"engineering,":[26],"and":[27,40,48,70,92,130,167,182],"harmful":[28,145],"voice":[29,185],"commands.":[30],"Traditional":[31],"methods":[33],"rely":[34],"on":[35,83,105,116],"converting":[36],"speech":[37,69],"text":[39],"then":[41],"filtering":[42],"that":[43,61],"text,":[44],"which":[45],"introduces":[46,56],"delays":[47],"can":[49,67],"ignore":[50],"important":[51],"audio":[52,104,122],"cues.":[53],"paper":[55,157],"VoiceSHIELD-Small,":[57],"lightweight":[59],"model":[60,125],"works":[62],"in":[63,79,184],"real":[64],"time.":[65,114],"It":[66,97],"transcribe":[68],"detect":[71],"whether":[72],"it":[73,140],"is":[74,172],"safe":[75],"or":[76],"harmful,":[77],"all":[78],"one":[80],"step.":[81],"Built":[82],"OpenAI's":[84],"Whisper-small":[85],"encoder,":[86],"VoiceSHIELD":[87,171],"adds":[88],"mean-pooling":[90],"layer":[91],"simple":[94],"classification":[95],"head.":[96],"takes":[98],"just":[99],"90-120":[100],"milliseconds":[101],"classify":[103],"mid-tier":[106],"GPUs,":[107],"while":[108],"transcription":[109],"happens":[110],"at":[111],"the":[112,124,137,160,175],"same":[113],"Tested":[115],"balanced":[118],"set":[119],"of":[120,134,144],"947":[121],"clips,":[123],"achieved":[126],"99.16":[127],"percent":[128,143],"accuracy":[129],"an":[131],"F1":[132],"score":[133],"0.9865.":[135],"At":[136],"default":[138],"setting,":[139],"missed":[141],"2.33":[142],"inputs.":[146],"Cross-validation":[147],"showed":[148],"consistent":[149],"performance":[150,165],"(F1":[151],"standard":[152],"deviation":[153],"=":[154],"0.0026).":[155],"The":[156],"covers":[159],"model's":[161],"design,":[162],"training":[163],"data,":[164],"trade-offs,":[166],"responsible":[168],"use":[169],"guidelines.":[170],"released":[173],"under":[174],"MIT":[176],"license":[177],"encourage":[179],"further":[180],"research":[181],"adoption":[183],"security.":[187]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-11T00:00:00"}
