{"id":"https://openalex.org/W4415708521","doi":"https://doi.org/10.1109/icme59968.2025.11210181","title":"STFTCodec: High-Fidelity Audio Compression through Time-Frequency Domain Representation","display_name":"STFTCodec: High-Fidelity Audio Compression through Time-Frequency Domain Representation","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708521","doi":"https://doi.org/10.1109/icme59968.2025.11210181"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11210181","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210181","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013882034","display_name":"Feng Tao","orcid":"https://orcid.org/0000-0001-6105-860X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tao Feng","raw_affiliation_strings":["Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079531245","display_name":"Zhiyuan Zhao","orcid":"https://orcid.org/0000-0002-1227-5058"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhiyuan Zhao","raw_affiliation_strings":["International Digital Economy Academy (IDEA),Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"International Digital Economy Academy (IDEA),Shenzhen,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008040193","display_name":"Yifan Xie","orcid":"https://orcid.org/0000-0003-1906-5253"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifan Xie","raw_affiliation_strings":["Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111325372","display_name":"Yuqi Ye","orcid":"https://orcid.org/0009-0001-7517-000X"},"institutions":[{"id":"https://openalex.org/I4210128628","display_name":"Peking University Shenzhen Hospital","ror":"https://ror.org/03kkjyb15","country_code":"CN","type":"healthcare","lineage":["https://openalex.org/I4210128628"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuqi Ye","raw_affiliation_strings":["Peking University,Shenzhen Graduate School,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Shenzhen Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I4210128628"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109423417","display_name":"Xiangyang Luo","orcid":"https://orcid.org/0009-0001-2828-387X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangyang Luo","raw_affiliation_strings":["Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xun Guan","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xun Guan","raw_affiliation_strings":["Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100404095","display_name":"Yulong Li","orcid":"https://orcid.org/0000-0002-4219-9041"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu Li","raw_affiliation_strings":["International Digital Economy Academy (IDEA),Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"International Digital Economy Academy (IDEA),Shenzhen,China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5013882034"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":1.2783,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.84618141,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.3693999946117401,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.3693999946117401,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.18880000710487366,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11034","display_name":"Digital Filter Design and Implementation","score":0.10779999941587448,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.638700008392334},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5785999894142151},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.5442000031471252},{"id":"https://openalex.org/keywords/short-time-fourier-transform","display_name":"Short-time Fourier transform","score":0.4756999909877777},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.46219998598098755},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.44699999690055847},{"id":"https://openalex.org/keywords/fourier-transform","display_name":"Fourier transform","score":0.4410000145435333},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.421099990606308},{"id":"https://openalex.org/keywords/transform-coding","display_name":"Transform coding","score":0.39100000262260437},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3887999951839447}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7677000164985657},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.638700008392334},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5785999894142151},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5458999872207642},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.5442000031471252},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5357000231742859},{"id":"https://openalex.org/C166386157","wikidata":"https://www.wikidata.org/wiki/Q1477735","display_name":"Short-time Fourier transform","level":4,"score":0.4756999909877777},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.46219998598098755},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4544999897480011},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.44699999690055847},{"id":"https://openalex.org/C102519508","wikidata":"https://www.wikidata.org/wiki/Q6520159","display_name":"Fourier transform","level":2,"score":0.4410000145435333},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.421099990606308},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4032999873161316},{"id":"https://openalex.org/C169805256","wikidata":"https://www.wikidata.org/wiki/Q1361381","display_name":"Transform coding","level":4,"score":0.39100000262260437},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3887999951839447},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.383899986743927},{"id":"https://openalex.org/C44280652","wikidata":"https://www.wikidata.org/wiki/Q104837","display_name":"Phase (matter)","level":2,"score":0.3727000057697296},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.36910000443458557},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.3517000079154968},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.34049999713897705},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.328900009393692},{"id":"https://openalex.org/C150178126","wikidata":"https://www.wikidata.org/wiki/Q18433212","display_name":"Dynamic range compression","level":2,"score":0.3248000144958496},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.320499986410141},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C2221639","wikidata":"https://www.wikidata.org/wiki/Q2877","display_name":"Discrete cosine transform","level":3,"score":0.31779998540878296},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29919999837875366},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.28360000252723694},{"id":"https://openalex.org/C28726691","wikidata":"https://www.wikidata.org/wiki/Q1268231","display_name":"Modified discrete cosine transform","level":5,"score":0.2827000021934509},{"id":"https://openalex.org/C2778192920","wikidata":"https://www.wikidata.org/wiki/Q16874989","display_name":"Signal compression","level":4,"score":0.274399995803833},{"id":"https://openalex.org/C142433447","wikidata":"https://www.wikidata.org/wiki/Q7806653","display_name":"Time\u2013frequency analysis","level":3,"score":0.273499995470047},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.25279998779296875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11210181","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210181","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1481955708","https://openalex.org/W1552314771","https://openalex.org/W2141998673","https://openalex.org/W2194775991","https://openalex.org/W2935711438","https://openalex.org/W2972359262","https://openalex.org/W3037038648","https://openalex.org/W3215615641","https://openalex.org/W4225956675","https://openalex.org/W4372260101","https://openalex.org/W4372270198","https://openalex.org/W4375869436","https://openalex.org/W4386076493","https://openalex.org/W4399875170","https://openalex.org/W4406461488","https://openalex.org/W4406461725"],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"STFTCodec,":[2],"a":[3],"novel":[4],"spectral-based":[5,85],"neural":[6],"audio":[7,12],"codec":[8],"that":[9,21,79],"efficiently":[10],"compresses":[11],"using":[13],"Short-Time":[14],"Fourier":[15],"Transform":[16],"(STFT).":[17],"Unlike":[18],"waveform-based":[19,83],"approaches":[20,86],"require":[22],"large":[23],"model":[24],"capacity":[25],"and":[26,38,51,84],"substantial":[27],"memory":[28],"consumption,":[29],"this":[30],"method":[31],"leverages":[32],"STFT":[33,99],"for":[34],"compact":[35],"spectral":[36],"representation":[37],"introduces":[39],"unwrapped":[40],"phase":[41,52,64],"derivatives":[42],"as":[43],"auxiliary":[44],"features.":[45],"Our":[46],"architecture":[47],"employs":[48],"parallel":[49],"magnitude":[50],"processing":[53],"branches":[54],"enhanced":[55],"by":[56],"advanced":[57],"feature":[58],"extraction":[59],"mechanisms.":[60],"By":[61],"relaxing":[62],"strict":[63],"reconstruction":[65],"constraints":[66],"while":[67,90],"maintaining":[68],"phase-aware":[69],"processing,":[70],"we":[71],"achieve":[72],"superior":[73],"perceptual":[74],"quality.":[75],"Experimental":[76],"results":[77],"demonstrate":[78],"STFTCodec":[80],"outperforms":[81],"both":[82],"across":[87],"multiple":[88],"bitrates,":[89],"offering":[91],"unique":[92],"flexibility":[93],"in":[94],"compression":[95],"ratio":[96],"adjustment":[97],"through":[98],"parameter":[100],"modification":[101],"without":[102],"architectural":[103],"changes.":[104]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-30T00:00:00"}
