{"id":"https://openalex.org/W4416250953","doi":"https://doi.org/10.1109/waspaa66052.2025.11230977","title":"Learning to Upsample and Upmix Audio in the Latent Domain","display_name":"Learning to Upsample and Upmix Audio in the Latent Domain","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416250953","doi":"https://doi.org/10.1109/waspaa66052.2025.11230977"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230977","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230977","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041168295","display_name":"Dimitrios Bralios","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Dimitrios Bralios","raw_affiliation_strings":["University of Illinois Urbana-Champaign,Urbana,IL,USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign,Urbana,IL,USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038903729","display_name":"Paris Smaragdis","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Paris Smaragdis","raw_affiliation_strings":["University of Illinois Urbana-Champaign,Urbana,IL,USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign,Urbana,IL,USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014125577","display_name":"Jonah Casebeer","orcid":"https://orcid.org/0000-0002-8741-9773"},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonah Casebeer","raw_affiliation_strings":["Adobe Research,San Francisco,CA,USA"],"affiliations":[{"raw_affiliation_string":"Adobe Research,San Francisco,CA,USA","institution_ids":["https://openalex.org/I1306409833"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5041168295"],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":2.2732,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.90496277,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.36890000104904175,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.36890000104904175,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.3034000098705292,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.07410000264644623,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.47290000319480896},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.46230000257492065},{"id":"https://openalex.org/keywords/sound-quality","display_name":"Sound quality","score":0.43459999561309814},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.37389999628067017},{"id":"https://openalex.org/keywords/digital-audio","display_name":"Digital audio","score":0.36660000681877136},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3580999970436096},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3188999891281128},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.31630000472068787}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8044000267982483},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.47290000319480896},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.46230000257492065},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4611000120639801},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45100000500679016},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.43459999561309814},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.36660000681877136},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3580999970436096},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3188999891281128},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.3149000108242035},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2867000102996826},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.27799999713897705},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.27549999952316284},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230977","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230977","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2593414223","https://openalex.org/W3160576174","https://openalex.org/W3215615641","https://openalex.org/W4221167052","https://openalex.org/W4285412734","https://openalex.org/W4372260387","https://openalex.org/W4381786045","https://openalex.org/W4385822255","https://openalex.org/W4386076493","https://openalex.org/W4388691923","https://openalex.org/W4392903177","https://openalex.org/W4396877837","https://openalex.org/W4400903650","https://openalex.org/W4402112643","https://openalex.org/W4402115964","https://openalex.org/W4404787821","https://openalex.org/W4406861173","https://openalex.org/W4408345930","https://openalex.org/W4408351887","https://openalex.org/W4408354268","https://openalex.org/W4408354918","https://openalex.org/W4408355354"],"related_works":[],"abstract_inverted_index":{"Neural":[0],"audio":[1,19,35,66,82,159,175],"autoencoders":[2],"create":[3],"compact":[4],"latent":[5,29,73,94,98,106],"representations":[6,52],"that":[7,64,115,162],"preserve":[8],"perceptually":[9],"important":[10],"information,":[11],"serving":[12],"as":[13,39],"the":[14,76,93],"foundation":[15],"for":[16,158],"both":[17],"modern":[18],"compression":[20],"systems":[21],"and":[22,28,41,123,130,169],"generation":[23],"approaches":[24],"like":[25],"next-token":[26],"prediction":[27],"diffusion.":[30],"Despite":[31],"their":[32],"prevalence,":[33],"most":[34],"processing":[36,67,160],"operations,":[37],"such":[38],"spatial":[40],"spectral":[42,51],"up-sampling,":[43],"still":[44],"inefficiently":[45],"operate":[46],"on":[47,56,148],"raw":[48,81,149],"waveforms":[49],"or":[50],"rather":[53],"than":[54],"directly":[55],"these":[57],"compressed":[58],"representations.":[59],"We":[60],"propose":[61],"a":[62,97,104,154],"framework":[63],"performs":[65],"operations":[68],"entirely":[69],"within":[70],"an":[71],"autoencoder\u2019s":[72],"space,":[74],"eliminating":[75],"need":[77],"to":[78,80,140,146],"decode":[79],"formats.":[83],"Our":[84],"approach":[85],"dramatically":[86],"simplifies":[87],"training":[88],"by":[89,103],"operating":[90],"solely":[91],"in":[92,127],"domain,":[95],"with":[96,112],"L1":[99],"reconstruction":[100],"term,":[101],"augmented":[102],"single":[105],"adversarial":[107],"discriminator.":[108],"This":[109,151],"contrasts":[110],"sharply":[111],"raw-audio":[113],"methods":[114],"typically":[116],"require":[117],"complex":[118],"combinations":[119],"of":[120,138],"multi-scale":[121],"losses":[122],"discriminators.":[124],"Through":[125],"experiments":[126],"bandwidth":[128],"extension":[129],"mono-to-stereo":[131],"upmixing,":[132],"we":[133],"demonstrate":[134],"computational":[135],"efficiency":[136],"gains":[137],"up":[139],"100\u00d7":[141],"while":[142],"maintaining":[143],"quality":[144],"comparable":[145],"post-processing":[147],"audio.":[150],"work":[152],"establishes":[153],"more":[155,170],"efficient":[156],"paradigm":[157],"pipelines":[161],"already":[163],"incorporate":[164],"autoencoders,":[165],"enabling":[166],"significantly":[167],"faster":[168],"resource-efficient":[171],"workflows":[172],"across":[173],"various":[174],"tasks.":[176]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-11-14T00:00:00"}
