{"id":"https://openalex.org/W4411799853","doi":"https://doi.org/10.1109/icmcis64378.2025.11047823","title":"Discrete Audio Representations from SoundStream: A Dual Approach to Efficient Transmission and Speech Detection","display_name":"Discrete Audio Representations from SoundStream: A Dual Approach to Efficient Transmission and Speech Detection","publication_year":2025,"publication_date":"2025-05-13","ids":{"openalex":"https://openalex.org/W4411799853","doi":"https://doi.org/10.1109/icmcis64378.2025.11047823"},"language":"en","primary_location":{"id":"doi:10.1109/icmcis64378.2025.11047823","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icmcis64378.2025.11047823","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Military Communication and Information Systems (ICMCIS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003409945","display_name":"Fahrettin G\u00f6kg\u00f6z","orcid":null},"institutions":[{"id":"https://openalex.org/I4210166245","display_name":"Fraunhofer Institute for Communication, Information Processing and Ergonomics","ror":"https://ror.org/05nn0gw40","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210166245","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Fahrettin G\u00f6kg\u00f6z","raw_affiliation_strings":["Fraunhofer FKIE,Wachtberg,Germany,53343"],"affiliations":[{"raw_affiliation_string":"Fraunhofer FKIE,Wachtberg,Germany,53343","institution_ids":["https://openalex.org/I4210166245"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047703641","display_name":"Hesham Ali","orcid":"https://orcid.org/0000-0001-6675-7987"},"institutions":[{"id":"https://openalex.org/I4210166245","display_name":"Fraunhofer Institute for Communication, Information Processing and Ergonomics","ror":"https://ror.org/05nn0gw40","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210166245","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Hisham Ali","raw_affiliation_strings":["Fraunhofer FKIE,Wachtberg,Germany,53343"],"affiliations":[{"raw_affiliation_string":"Fraunhofer FKIE,Wachtberg,Germany,53343","institution_ids":["https://openalex.org/I4210166245"]}]},{"author_position":"last","author":{"id":null,"display_name":"Priya Pal","orcid":null},"institutions":[{"id":"https://openalex.org/I4210166245","display_name":"Fraunhofer Institute for Communication, Information Processing and Ergonomics","ror":"https://ror.org/05nn0gw40","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210166245","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Priya Pal","raw_affiliation_strings":["Fraunhofer FKIE,Wachtberg,Germany,53343"],"affiliations":[{"raw_affiliation_string":"Fraunhofer FKIE,Wachtberg,Germany,53343","institution_ids":["https://openalex.org/I4210166245"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5003409945"],"corresponding_institution_ids":["https://openalex.org/I4210166245"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17482983,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7294687628746033},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.6679244637489319},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.5349000692367554},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5316852927207947},{"id":"https://openalex.org/keywords/transmission","display_name":"Transmission (telecommunications)","score":0.4867872893810272},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4370131492614746},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3968549966812134},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.17670515179634094}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7294687628746033},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.6679244637489319},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.5349000692367554},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5316852927207947},{"id":"https://openalex.org/C761482","wikidata":"https://www.wikidata.org/wiki/Q118093","display_name":"Transmission (telecommunications)","level":2,"score":0.4867872893810272},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4370131492614746},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3968549966812134},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.17670515179634094},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icmcis64378.2025.11047823","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icmcis64378.2025.11047823","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Military Communication and Information Systems (ICMCIS)","raw_type":"proceedings-article"},{"id":"pmh:oai:publica.fraunhofer.de:publica/494708","is_oa":false,"landing_page_url":"https://publica.fraunhofer.de/handle/publica/494708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400318","display_name":"Fraunhofer-Publica (Fraunhofer-Gesellschaft)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4923324","host_organization_name":"Fraunhofer-Gesellschaft","host_organization_lineage":["https://openalex.org/I4923324"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1552853484","https://openalex.org/W1970491336","https://openalex.org/W2067295501","https://openalex.org/W2106097867","https://openalex.org/W2119564743","https://openalex.org/W2486436843","https://openalex.org/W2911964244","https://openalex.org/W2939156090","https://openalex.org/W3018265077","https://openalex.org/W3025844872","https://openalex.org/W3127686677","https://openalex.org/W3161480375","https://openalex.org/W3196475561","https://openalex.org/W3215615641","https://openalex.org/W4296906760","https://openalex.org/W4399372393","https://openalex.org/W4405103370","https://openalex.org/W6804017046","https://openalex.org/W6853515095","https://openalex.org/W7010488078"],"related_works":["https://openalex.org/W191108438","https://openalex.org/W3135230428","https://openalex.org/W2496295964","https://openalex.org/W1911859126","https://openalex.org/W2541680182","https://openalex.org/W642007152","https://openalex.org/W2131711534","https://openalex.org/W2559040841","https://openalex.org/W114661351","https://openalex.org/W2056066842"],"abstract_inverted_index":{"This":[0,136,160],"paper":[1,161],"investigates":[2],"the":[3,65,91,101,107,129,139,166,176,187],"application":[4],"of":[5,32,94,103,141],"Sound-Stream,":[6],"a":[7,51],"state-of-the-art":[8],"neural":[9],"audio":[10,15,41,46,120,146],"codec,":[11],"to":[12,37,59,117,144],"achieve":[13],"efficient":[14],"transmission":[16],"and":[17,90,122,156,169,182],"effective":[18],"speech":[19,62,95,134],"detection":[20,55],"in":[21,148,152,190],"resource-constrained":[22],"environments.":[23],"We":[24],"analyze":[25],"SoundStream's":[26,115],"architecture,":[27],"emphasizing":[28],"its":[29],"innovative":[30],"use":[31],"Residual":[33],"Vector":[34],"Quantization":[35],"(RVQ)":[36],"create":[38],"compact,":[39],"discrete":[40],"representations":[42],"while":[43,128],"preserving":[44],"essential":[45],"features.":[47],"Additionally,":[48],"we":[49,99],"introduce":[50],"novel":[52],"voice":[53],"activity":[54],"(VAD)":[56],"algorithm":[57,131],"designed":[58],"identify":[60],"relevant":[61],"segments":[63],"within":[64],"transmitted":[66],"audio.":[67],"Our":[68],"evaluation":[69],"employs":[70],"objective":[71],"metrics,":[72],"including":[73],"Deep":[74],"Noise":[75],"Suppression":[76],"Mean":[77],"Opinion":[78],"Score":[79],"(DNSMOS),":[80],"Non-Intrusive":[81],"Speech":[82],"Quality":[83],"Assessment":[84],"(NISQA),":[85],"Short-Time":[86],"Objective":[87],"Intelligibility":[88],"(STOI),":[89],"density":[92],"distribution":[93],"over":[96],"codebooks.":[97],"Furthermore,":[98],"assess":[100],"performance":[102],"our":[104],"VAD":[105,130],"using":[106],"<tex":[108],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[109],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\mathrm{F}$</tex>":[110],"-measure.":[111],"The":[112],"results":[113],"demonstrate":[114],"capability":[116],"maintain":[118],"high":[119],"fidelity":[121],"intelligibility":[123],"despite":[124],"varying":[125],"encoding":[126],"stages,":[127],"effectively":[132],"ensures":[133],"detection.":[135],"study":[137],"highlights":[138],"potential":[140],"these":[142],"methodologies":[143],"enhance":[145],"processing":[147],"diverse":[149],"applications,":[150],"particularly":[151],"scenarios":[153],"where":[154],"bandwidth":[155],"clarity":[157],"are":[158],"critical.":[159],"was":[162],"originally":[163],"presented":[164],"at":[165],"NATO":[167],"Science":[168],"Technology":[170,179],"Organization":[171],"Symposium":[172],"(ICMCIS)":[173],"organized":[174],"by":[175],"Information":[177],"Systems":[178],"(IST)":[180],"Scientific":[181],"Technical":[183],"Committee,":[184],"IST-209-RSY":[185],"-":[186],"ICMCIS,":[188],"held":[189],"Oeiras,":[191],"Portugal,":[192],"13\u201314":[193],"May":[194],"2025.\u201c":[195]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
