{"id":"https://openalex.org/W4404740148","doi":"https://doi.org/10.1109/jstsp.2024.3506286","title":"SemantiCodec: An Ultra Low Bitrate Semantic Audio Codec for General Sound","display_name":"SemantiCodec: An Ultra Low Bitrate Semantic Audio Codec for General Sound","publication_year":2024,"publication_date":"2024-11-26","ids":{"openalex":"https://openalex.org/W4404740148","doi":"https://doi.org/10.1109/jstsp.2024.3506286"},"language":"en","primary_location":{"id":"doi:10.1109/jstsp.2024.3506286","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jstsp.2024.3506286","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089924305","display_name":"Haohe Liu","orcid":"https://orcid.org/0000-0003-1036-7888"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Haohe Liu","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"raw_orcid":"https://orcid.org/0000-0003-1036-7888","affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-8718-1278","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100593867","display_name":"Yi Yuan","orcid":"https://orcid.org/0000-0002-6887-0956"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yi Yuan","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"raw_orcid":"https://orcid.org/0000-0002-6887-0956","affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-5599-8707","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"raw_orcid":"https://orcid.org/0000-0002-8393-5703","affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066967599","display_name":"Mark D. Plumbley","orcid":"https://orcid.org/0000-0002-9708-1075"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mark D. Plumbley","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"raw_orcid":"https://orcid.org/0000-0002-9708-1075","affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5089924305"],"corresponding_institution_ids":["https://openalex.org/I28290843"],"apc_list":null,"apc_paid":null,"fwci":9.2069,"has_fulltext":false,"cited_by_count":28,"citation_normalized_percentile":{"value":0.98722367,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":"18","issue":"8","first_page":"1448","last_page":"1461"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9786999821662903,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.7828042507171631},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7313107252120972},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.479771226644516},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.17377358675003052}],"concepts":[{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.7828042507171631},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7313107252120972},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.479771226644516},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.17377358675003052}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/jstsp.2024.3506286","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jstsp.2024.3506286","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7697158467","display_name":"AI for Sound","funder_award_id":"EP/T019751/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G8857457899","display_name":null,"funder_award_id":"EP/T019751/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"},{"id":"https://openalex.org/F4320334912","display_name":"Surrey Space Centre, University of Surrey","ror":"https://ror.org/00ks66431"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":72,"referenced_works":["https://openalex.org/W1481955708","https://openalex.org/W1728888090","https://openalex.org/W2030931454","https://openalex.org/W2052666245","https://openalex.org/W2078014983","https://openalex.org/W2223628738","https://openalex.org/W2293634267","https://openalex.org/W2752796333","https://openalex.org/W2765811365","https://openalex.org/W2935711438","https://openalex.org/W2972359262","https://openalex.org/W3015371781","https://openalex.org/W3094502228","https://openalex.org/W3162391496","https://openalex.org/W3198694222","https://openalex.org/W3200409031","https://openalex.org/W3209059054","https://openalex.org/W3210163254","https://openalex.org/W3215615641","https://openalex.org/W4256241733","https://openalex.org/W4307323391","https://openalex.org/W4312933868","https://openalex.org/W4360951492","https://openalex.org/W4372348103","https://openalex.org/W4381786045","https://openalex.org/W4386918931","https://openalex.org/W4387969125","https://openalex.org/W4391020683","https://openalex.org/W4392903114","https://openalex.org/W4392903177","https://openalex.org/W4394625831","https://openalex.org/W4396877837","https://openalex.org/W4400033239","https://openalex.org/W6610566761","https://openalex.org/W6630442970","https://openalex.org/W6633499030","https://openalex.org/W6639363673","https://openalex.org/W6640963894","https://openalex.org/W6678914141","https://openalex.org/W6697040288","https://openalex.org/W6734260513","https://openalex.org/W6736723571","https://openalex.org/W6750665317","https://openalex.org/W6766320909","https://openalex.org/W6779823529","https://openalex.org/W6783713337","https://openalex.org/W6783867762","https://openalex.org/W6795261426","https://openalex.org/W6796464841","https://openalex.org/W6801426729","https://openalex.org/W6802017037","https://openalex.org/W6802805937","https://openalex.org/W6809884996","https://openalex.org/W6809947431","https://openalex.org/W6810940779","https://openalex.org/W6838844135","https://openalex.org/W6840200333","https://openalex.org/W6840815571","https://openalex.org/W6845479124","https://openalex.org/W6847363464","https://openalex.org/W6848208918","https://openalex.org/W6848482659","https://openalex.org/W6848735303","https://openalex.org/W6849105126","https://openalex.org/W6849109464","https://openalex.org/W6849416043","https://openalex.org/W6850625674","https://openalex.org/W6852581948","https://openalex.org/W6853096648","https://openalex.org/W6853249747","https://openalex.org/W6853515095","https://openalex.org/W6853998256"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2964213236","https://openalex.org/W2163719598","https://openalex.org/W3161919736","https://openalex.org/W2387018512","https://openalex.org/W2107680156","https://openalex.org/W4301184752","https://openalex.org/W2288771647"],"abstract_inverted_index":{"Large":[0],"languagemodels":[1],"(LLMs)":[2],"have":[3],"significantly":[4,168,184,196],"advanced":[5],"audio":[6,9,13,25,64,75,108,129,192],"processing":[7],"through":[8],"codecs":[10,29],"that":[11,166,181],"convert":[12],"into":[14,65],"discrete":[15],"tokens,":[16],"enabling":[17],"the":[18,44,116,170],"application":[19],"of":[20,143,153],"language":[21,50],"modelling":[22],"techniques":[23],"to":[24,62,114,127],"data.":[26],"However,":[27],"traditional":[28],"often":[30],"operate":[31],"at":[32,195],"high":[33],"bitrates":[34],"or":[35],"within":[36],"narrow":[37],"domains":[38],"such":[39],"as":[40],"speech":[41],"and":[42,81,110,121,146,160],"lack":[43],"semantic":[45,92,120,186],"clues":[46],"required":[47],"for":[48],"efficient":[49],"modelling.":[51],"Addressing":[52],"these":[53],"challenges,":[54],"we":[55],"introduce":[56],"SemantiCodec,":[57],"a":[58,68,88,91,95,131,151],"novel":[59],"codec":[60,173],"designed":[61],"compress":[63],"fewer":[66],"than":[67,188],"hundred":[69],"tokens":[70],"per":[71,148],"second":[72],"across":[73],"diverse":[74],"types,":[76],"including":[77],"speech,":[78],"general":[79],"sound,":[80],"music,":[82],"without":[83],"compromising":[84],"quality.":[85,176],"SemantiCodec":[86,134,167,182],"features":[87],"dual-encoder":[89],"architecture:":[90],"encoder":[93,113,123],"using":[94,103],"self-supervised":[96],"pre-trained":[97],"Audio":[98],"Masked":[99],"Autoencoder":[100],"(AudioMAE),":[101],"discretized":[102],"k-means":[104],"clustering":[105],"on":[106,174],"extensive":[107],"data,":[109],"an":[111],"acoustic":[112,122],"capture":[115],"remaining":[117],"details.":[118],"The":[119],"outputs":[124],"are":[125],"used":[126],"reconstruct":[128],"via":[130],"diffusion-model-based":[132],"decoder.":[133],"is":[135],"presented":[136],"in":[137],"three":[138],"variants":[139],"with":[140],"token":[141],"rates":[142,156],"25,":[144],"50,":[145],"100":[147],"second,":[149],"supporting":[150],"range":[152],"ultra-low":[154],"bit":[155],"between":[157],"0.31":[158],"kbps":[159],"1.40":[161],"kbps.":[162],"Experimental":[163],"results":[164,178],"demonstrate":[165],"outperforms":[169],"state-of-the-art":[171,191],"Descript":[172],"reconstruction":[175],"Our":[177],"also":[179],"suggest":[180],"contains":[183],"richer":[185],"information":[187],"all":[189],"evaluated":[190],"codecs,":[193],"even":[194],"lower":[197],"bitrates.":[198]},"counts_by_year":[{"year":2026,"cited_by_count":11},{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":4}],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
