{"id":"https://openalex.org/W4403127074","doi":"https://doi.org/10.1109/iwaenc61483.2024.10694300","title":"High-Fidelity Diffusion-Based Audio Codec","display_name":"High-Fidelity Diffusion-Based Audio Codec","publication_year":2024,"publication_date":"2024-09-09","ids":{"openalex":"https://openalex.org/W4403127074","doi":"https://doi.org/10.1109/iwaenc61483.2024.10694300"},"language":"en","primary_location":{"id":"doi:10.1109/iwaenc61483.2024.10694300","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iwaenc61483.2024.10694300","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 18th International Workshop on Acoustic Signal Enhancement (IWAENC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049952528","display_name":"Zhengpu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhengpu Zhang","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012866533","display_name":"Jianyuan Feng","orcid":"https://orcid.org/0000-0001-6330-5918"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jianyuan Feng","raw_affiliation_strings":["ByteDance,Data Speech,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Data Speech,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006037669","display_name":"Yongjian Mao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yongjian Mao","raw_affiliation_strings":["ByteDance,Data Speech,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Data Speech,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110371638","display_name":"Yehang Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yehang Zhu","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068988191","display_name":"Junjie Shi","orcid":"https://orcid.org/0000-0002-0710-2865"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junjie Shi","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019501293","display_name":"Xuzhou Ye","orcid":"https://orcid.org/0009-0002-7160-6360"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xuzhou Ye","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100690915","display_name":"Shilei Liu","orcid":"https://orcid.org/0000-0002-9053-8458"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shilei Liu","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004057129","display_name":"Derong Liu","orcid":"https://orcid.org/0000-0003-3715-4778"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Derong Liu","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018343941","display_name":"Chuanzeng Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chuanzeng Huang","raw_affiliation_strings":["ByteDance,Streaming Audio IA,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance,Streaming Audio IA,China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5049952528"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4652,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.64211039,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"344","last_page":"348"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9810000061988831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9810000061988831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.8031816482543945},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7264125347137451},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.6918829679489136},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5381221771240234},{"id":"https://openalex.org/keywords/adaptive-multi-rate-audio-codec","display_name":"Adaptive Multi-Rate audio codec","score":0.5213652849197388},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.47769811749458313},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32013410329818726},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.17850583791732788},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.13751128315925598},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.09540727734565735},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08643487095832825},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.07941541075706482}],"concepts":[{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.8031816482543945},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7264125347137451},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.6918829679489136},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5381221771240234},{"id":"https://openalex.org/C177067256","wikidata":"https://www.wikidata.org/wiki/Q4676210","display_name":"Adaptive Multi-Rate audio codec","level":4,"score":0.5213652849197388},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.47769811749458313},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32013410329818726},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.17850583791732788},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.13751128315925598},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.09540727734565735},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08643487095832825},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.07941541075706482},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iwaenc61483.2024.10694300","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iwaenc61483.2024.10694300","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 18th International Workshop on Acoustic Signal Enhancement (IWAENC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2143612128","https://openalex.org/W2178928294","https://openalex.org/W2760103357","https://openalex.org/W2775336875","https://openalex.org/W2785562966","https://openalex.org/W2935711438","https://openalex.org/W2963091184","https://openalex.org/W3037038648","https://openalex.org/W3084645150","https://openalex.org/W3144575004","https://openalex.org/W3163662330","https://openalex.org/W3215615641","https://openalex.org/W4281820413","https://openalex.org/W4284972581","https://openalex.org/W4307323391","https://openalex.org/W4372270198","https://openalex.org/W4375869380","https://openalex.org/W4386764371","https://openalex.org/W4392903887","https://openalex.org/W6634817459","https://openalex.org/W6679045638","https://openalex.org/W6744762798","https://openalex.org/W6765775151","https://openalex.org/W6766320909","https://openalex.org/W6779823529","https://openalex.org/W6782380777","https://openalex.org/W6786375611","https://openalex.org/W6796762324","https://openalex.org/W6852581948","https://openalex.org/W6853515095"],"related_works":["https://openalex.org/W4252424250","https://openalex.org/W2370747337","https://openalex.org/W1890500690","https://openalex.org/W2157819213","https://openalex.org/W2205640666","https://openalex.org/W2133351881","https://openalex.org/W2382399415","https://openalex.org/W2369511196","https://openalex.org/W2163719598","https://openalex.org/W4241950017"],"abstract_inverted_index":{"The":[0,69,121],"compression":[1,42,91],"of":[2,18,29,49,78,117],"audio":[3,10,50,67,90,138],"signals":[4],"plays":[5],"a":[6,26,57,63,96,105],"crucial":[7],"role":[8,77],"in":[9,82],"storage":[11],"and":[12,72,80],"transmission,":[13],"particularly":[14],"within":[15],"the":[16,41,47,76,84,101,108,118,131],"context":[17],"streaming":[19],"media":[20],"applications,":[21],"where":[22],"bandwidth":[23],"utilization":[24],"is":[25,37,62,111,127],"dominant":[27],"factor":[28],"cost.":[30],"Motivated":[31],"by":[32],"this":[33,53],"challenge,":[34],"our":[35],"objective":[36],"to":[38,99,112,145],"continuously":[39],"enhance":[40],"rate":[43],"while":[44],"simultaneously":[45],"ensuring":[46],"retention":[48],"quality.":[51],"In":[52],"paper,":[54],"we":[55,94],"present":[56],"diffusion-based":[58],"codec":[59],"sDiff-Codec,":[60,83],"which":[61],"state-of-the-art,":[64],"high-fidelity":[65],"neural":[66],"codec.":[68],"condition":[70],"module":[71,74],"generator":[73],"serve":[75],"encoder":[79],"decoder":[81],"sound":[85],"quality":[86],"enhancement":[87],"task":[88],"becomes":[89],"task.":[92],"Additionally,":[93],"employed":[95],"hybrid":[97],"quantizer":[98],"quantize":[100],"latent":[102],"information":[103,116],"using":[104],"hyper-prior":[106,109],"model,":[107],"model":[110],"generate":[113],"prior":[114],"auxiliary":[115],"entropy":[119],"model.":[120],"experiment":[122],"results":[123],"show":[124],"that":[125],"sDiff-Codec":[126],"superior":[128],"compared":[129],"with":[130],"baseline":[132],"methods":[133],"under":[134],"scenarios":[135],"when":[136],"monophonic":[137],"signal":[139],"bitrate":[140],"ranges":[141],"from":[142],"16":[143],"kbps":[144],"192":[146],"kbps.":[147]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
