{"id":"https://openalex.org/W4408352114","doi":"https://doi.org/10.1109/icassp49660.2025.10889311","title":"ImmerseDiffusion: A Generative Spatial Audio Latent Diffusion Model","display_name":"ImmerseDiffusion: A Generative Spatial Audio Latent Diffusion Model","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352114","doi":"https://doi.org/10.1109/icassp49660.2025.10889311"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10889311","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889311","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Mojtaba Heydari","orcid":null},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Mojtaba Heydari","raw_affiliation_strings":["University of Rochester"],"affiliations":[{"raw_affiliation_string":"University of Rochester","institution_ids":["https://openalex.org/I5388228"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041575981","display_name":"Mehrez Souden","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mehrez Souden","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114534262","display_name":"Bruno Conejo","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Bruno Conejo","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059482778","display_name":"Joshua Atkins","orcid":"https://orcid.org/0000-0003-4099-5375"},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Joshua Atkins","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I5388228"],"apc_list":null,"apc_paid":null,"fwci":7.6696,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.97179566,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9629999995231628,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7107030153274536},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5435850024223328},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.48341596126556396},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.45642468333244324},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3753593862056732},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.35063111782073975}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7107030153274536},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5435850024223328},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.48341596126556396},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.45642468333244324},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3753593862056732},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35063111782073975},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10889311","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889311","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2031865865","https://openalex.org/W2033875152","https://openalex.org/W2204091679","https://openalex.org/W2810934215","https://openalex.org/W3015591594","https://openalex.org/W4235140465","https://openalex.org/W4372260310","https://openalex.org/W4381786045","https://openalex.org/W4385822571","https://openalex.org/W4390872297","https://openalex.org/W4393147260","https://openalex.org/W4396877837","https://openalex.org/W4401110409","https://openalex.org/W6776218486","https://openalex.org/W6809884996","https://openalex.org/W6841982715","https://openalex.org/W6849109464","https://openalex.org/W6849517043","https://openalex.org/W6853096648","https://openalex.org/W6853515095","https://openalex.org/W6858340110","https://openalex.org/W6858775574","https://openalex.org/W6861353174","https://openalex.org/W6864798992","https://openalex.org/W6875330338"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4391584540","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W4395044357","https://openalex.org/W4287117424","https://openalex.org/W4387506531","https://openalex.org/W2087346071","https://openalex.org/W2967848559"],"abstract_inverted_index":{"We":[0,107],"introduce":[1],"ImmerseDiffusion,":[2],"an":[3],"end-to-end":[4],"generative":[5,53],"audio":[6,38,60,65,93],"model":[7,72,126],"that":[8,43,62,162],"produces":[9],"3D":[10],"immersive":[11],"soundscapes":[12],"conditioned":[13],"on":[14,75],"the":[15,112,118,125,137,166],"spatial,":[16,83],"temporal,":[17],"and":[18,85,89,94,102,114,133,147,154,169],"environmental":[19,86],"conditions":[20,168],"of":[21,57,117,130],"sound":[22],"objects.":[23],"ImmerseDiffusion":[24],"is":[25,34,55],"trained":[26,73,97],"to":[27,47,66,110],"generate":[28],"first-order":[29],"ambisonics":[30],"(FOA)":[31],"audio,":[32],"which":[33,142,149],"a":[35,58,69,91,99],"conventional":[36],"spatial":[37,49,59,92,115,120,134,144,155,172],"format":[39],"comprising":[40],"four":[41],"channels":[42],"can":[44],"be":[45],"rendered":[46],"multichannel":[48],"output.":[50],"The":[51],"proposed":[52,139],"system":[54],"composed":[56],"codec":[61],"maps":[63],"FOA":[64],"latent":[67,70],"components,":[68],"diffusion":[71],"based":[74],"various":[76],"user":[77,167],"input":[78],"types,":[79],"namely,":[80],"text":[81,95,145,152],"prompts,":[82],"temporal":[84],"acoustic":[87],"parameters,":[88],"optionally":[90],"encoder":[96],"in":[98,128],"Contrastive":[100],"Language":[101],"Audio":[103],"Pretraining":[104],"(CLAP)":[105],"style.":[106],"propose":[108],"metrics":[109],"evaluate":[111],"quality":[113,132],"adherence":[116],"generated":[119],"audio.":[121],"Finally,":[122],"we":[123],"assess":[124],"performance":[127],"terms":[129],"generation":[131],"conformance,":[135],"comparing":[136],"two":[138],"modes:":[140],"\"descriptive\",":[141],"uses":[143,150],"prompts)":[146],"\"parametric\",":[148],"non-spatial":[151],"prompts":[153],"parameters.":[156],"Our":[157],"evaluations":[158],"demonstrate":[159],"promising":[160],"results":[161],"are":[163],"consistent":[164],"with":[165],"reflect":[170],"reliable":[171],"fidelity.":[173]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
