{"id":"https://openalex.org/W7108700533","doi":"https://doi.org/10.5281/zenodo.17811403","title":"CoDiCodec: Unifying Continuous and Discrete Compressed Representations of Audio","display_name":"CoDiCodec: Unifying Continuous and Discrete Compressed Representations of Audio","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108700533","doi":"https://doi.org/10.5281/zenodo.17811403"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811403","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811403","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811403","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Marco Pasini","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Marco Pasini","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Stefan Lattner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stefan Lattner","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"George Fazekas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"George Fazekas","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.60113585,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8730000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8730000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.025100000202655792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.01720000058412552,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.6183000206947327},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.5059000253677368},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.46459999680519104},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.45879998803138733},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4408000111579895},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.41830000281333923},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.41350001096725464},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4011000096797943},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.4004000127315521}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6966999769210815},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.6183000206947327},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5472000241279602},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.5059000253677368},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.46459999680519104},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.45879998803138733},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4408000111579895},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.41830000281333923},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.41350001096725464},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4011000096797943},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.4004000127315521},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.39820000529289246},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.3928999900817871},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.3928999900817871},{"id":"https://openalex.org/C124851039","wikidata":"https://www.wikidata.org/wiki/Q2665459","display_name":"Compressed sensing","level":2,"score":0.3662000000476837},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C2221639","wikidata":"https://www.wikidata.org/wiki/Q2877","display_name":"Discrete cosine transform","level":3,"score":0.3472000062465668},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.3407000005245209},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3294000029563904},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32350000739097595},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.3124000132083893},{"id":"https://openalex.org/C94835093","wikidata":"https://www.wikidata.org/wiki/Q3113333","display_name":"Data compression ratio","level":5,"score":0.2822999954223633},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2777999937534332},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.27000001072883606},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.2639999985694885},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C10184394","wikidata":"https://www.wikidata.org/wiki/Q5165491","display_name":"Continuous modelling","level":2,"score":0.2596000134944916},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811403","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811403","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811403","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811403","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Efficiently":[0],"representing":[1],"audio":[2,35,45,138,158,167],"signals":[3],"in":[4,154],"a":[5,20,38,43,74,101,128,163],"compressed":[6,64],"latent":[7,12],"space":[8],"is":[9,93],"critical":[10],"for":[11,87,118],"generative":[13,90,176],"modelling.":[14],"However,":[15],"existing":[16,146],"autoencoders":[17,150],"often":[18],"force":[19],"choice":[21],"between":[22,172],"continuous":[23,65,147,173],"embeddings":[24,66],"and":[25,60,70,100,105,127,140,148,174],"discrete":[26,71,149,175],"tokens.":[27],"Furthermore,":[28],"achieving":[29,136],"high":[30],"compression":[31],"ratios":[32],"while":[33],"maintaining":[34],"fidelity":[36],"remains":[37],"challenge.":[39],"This":[40,92],"paper":[41],"introduces":[42],"novel":[44,102,129],"autoencoder":[46],"that":[47],"overcomes":[48],"these":[49],"limitations":[50],"by":[51,61],"both":[52,63,124],"efficiently":[53],"encoding":[54],"global":[55],"features":[56],"via":[57],"summary":[58],"embeddings,":[59],"producing":[62],"at":[67,73,151],"~11":[68],"Hz":[69],"tokens":[72],"rate":[75],"of":[76,156],"2.38":[77],"kbps":[78],"from":[79],"the":[80,113,134,170],"same":[81],"trained":[82],"model,":[83],"offering":[84],"unprecedented":[85],"flexibility":[86],"different":[88],"downstream":[89],"tasks.":[91],"achieved":[94],"through":[95],"Finite":[96],"Scalar":[97],"Quantization":[98],"(FSQ)":[99],"FSQ-dropout":[103],"technique,":[104],"does":[106],"not":[107],"require":[108],"additional":[109],"loss":[110,116],"terms":[111,155],"beyond":[112],"single":[114],"consistency":[115],"used":[117],"end-to-end":[119],"training.":[120],"Our":[121,143,160],"model":[122,144],"supports":[123],"autoregressive":[125],"decoding":[126,131],"parallel":[130],"strategy,":[132],"with":[133],"latter":[135],"superior":[137],"quality":[139],"faster":[141],"decoding.":[142],"outperforms":[145],"similar":[152],"bitrates":[153],"reconstruction":[157],"quality.":[159],"work":[161],"enables":[162],"unified":[164],"approach":[165],"to":[166],"compression,":[168],"bridging":[169],"gap":[171],"modelling":[177],"paradigms.":[178]},"counts_by_year":[],"updated_date":"2025-12-05T23:25:22.460635","created_date":"2025-12-05T00:00:00"}
