{"id":"https://openalex.org/W7134234102","doi":"https://doi.org/10.1007/s11263-025-02610-4","title":"Towards Scene-Aware Video-to-Spatial Audio Generation","display_name":"Towards Scene-Aware Video-to-Spatial Audio Generation","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7134234102","doi":"https://doi.org/10.1007/s11263-025-02610-4"},"language":"en","primary_location":{"id":"doi:10.1007/s11263-025-02610-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02610-4","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1007/s11263-025-02610-4","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128462775","display_name":"Jaeyeon Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jaeyeon Kim","raw_affiliation_strings":["Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I139264467"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050409160","display_name":"Heeseung Yun","orcid":null},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Heeseung Yun","raw_affiliation_strings":["Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I139264467"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5128568383","display_name":"Gunhee Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Gunhee Kim","raw_affiliation_strings":["Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I139264467"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5128568383"],"corresponding_institution_ids":["https://openalex.org/I139264467"],"apc_list":{"value":2890,"currency":"EUR","value_usd":3690},"apc_paid":{"value":2890,"currency":"EUR","value_usd":3690},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.90251046,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"134","issue":"4","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.6261000037193298,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.6261000037193298,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.13760000467300415,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.12479999661445618,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ambisonics","display_name":"Ambisonics","score":0.6039999723434448},{"id":"https://openalex.org/keywords/chaining","display_name":"Chaining","score":0.5393999814987183},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.5231999754905701},{"id":"https://openalex.org/keywords/codebook","display_name":"Codebook","score":0.4320000112056732},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.41999998688697815},{"id":"https://openalex.org/keywords/sound-quality","display_name":"Sound quality","score":0.39570000767707825},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.37450000643730164},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.35659998655319214}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8675000071525574},{"id":"https://openalex.org/C47726159","wikidata":"https://www.wikidata.org/wiki/Q457547","display_name":"Ambisonics","level":3,"score":0.6039999723434448},{"id":"https://openalex.org/C49020025","wikidata":"https://www.wikidata.org/wiki/Q1059099","display_name":"Chaining","level":2,"score":0.5393999814987183},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.5231999754905701},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4650999903678894},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44920000433921814},{"id":"https://openalex.org/C127759330","wikidata":"https://www.wikidata.org/wiki/Q637416","display_name":"Codebook","level":2,"score":0.4320000112056732},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.41999998688697815},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.39570000767707825},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.37450000643730164},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.3465999960899353},{"id":"https://openalex.org/C150178126","wikidata":"https://www.wikidata.org/wiki/Q18433212","display_name":"Dynamic range compression","level":2,"score":0.3427000045776367},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2935999929904938},{"id":"https://openalex.org/C2985909886","wikidata":"https://www.wikidata.org/wiki/Q193147","display_name":"Spatial coherence","level":3,"score":0.29249998927116394},{"id":"https://openalex.org/C159379195","wikidata":"https://www.wikidata.org/wiki/Q7239568","display_name":"Precomputation","level":3,"score":0.2865999937057495},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.27570000290870667},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.25999999046325684}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s11263-025-02610-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02610-4","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s11263-025-02610-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02610-4","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.656832218170166,"id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":71,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2016348563","https://openalex.org/W2194775991","https://openalex.org/W2526050071","https://openalex.org/W2565639579","https://openalex.org/W2593116425","https://openalex.org/W2798665861","https://openalex.org/W2937949426","https://openalex.org/W2963503775","https://openalex.org/W2963807156","https://openalex.org/W2964345931","https://openalex.org/W2972359262","https://openalex.org/W2972478942","https://openalex.org/W2982624843","https://openalex.org/W3005064518","https://openalex.org/W3015371781","https://openalex.org/W3049847664","https://openalex.org/W3096780661","https://openalex.org/W3174854700","https://openalex.org/W3180355996","https://openalex.org/W3197273793","https://openalex.org/W3205475937","https://openalex.org/W4214604251","https://openalex.org/W4236344233","https://openalex.org/W4236695114","https://openalex.org/W4290097456","https://openalex.org/W4312724358","https://openalex.org/W4312777209","https://openalex.org/W4313021454","https://openalex.org/W4372260310","https://openalex.org/W4372348103","https://openalex.org/W4381333750","https://openalex.org/W4385823191","https://openalex.org/W4386071828","https://openalex.org/W4387872943","https://openalex.org/W4390874242","https://openalex.org/W4392902609","https://openalex.org/W4392902753","https://openalex.org/W4392902843","https://openalex.org/W4392902968","https://openalex.org/W4392903177","https://openalex.org/W4392903391","https://openalex.org/W4392910528","https://openalex.org/W4393160294","https://openalex.org/W4394891064","https://openalex.org/W4402111255","https://openalex.org/W4402112400","https://openalex.org/W4402670057","https://openalex.org/W4402671950","https://openalex.org/W4402727052","https://openalex.org/W4403906568","https://openalex.org/W4404037650","https://openalex.org/W4404545746","https://openalex.org/W4408347147","https://openalex.org/W4408352114","https://openalex.org/W4408352258","https://openalex.org/W4408352493","https://openalex.org/W4408356092","https://openalex.org/W4411245083","https://openalex.org/W4412944874","https://openalex.org/W4413144830","https://openalex.org/W4413145987","https://openalex.org/W4413147701","https://openalex.org/W4415796284","https://openalex.org/W4415798716","https://openalex.org/W6912494966","https://openalex.org/W7133218093","https://openalex.org/W7133218909","https://openalex.org/W7133219565","https://openalex.org/W7133224666","https://openalex.org/W7133229781"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"Spatial":[1],"audio":[2,39,83,130,206],"is":[3],"essential":[4],"for":[5,82,175,180],"enhancing":[6],"the":[7],"immersiveness":[8],"of":[9,31,101],"audio-visual":[10],"experiences,":[11],"yet":[12],"its":[13,86],"production":[14],"typically":[15],"demands":[16],"complex":[17,149],"recording":[18],"systems":[19],"and":[20,61,85,128,162,177,196],"specialized":[21],"expertise.":[22],"In":[23],"this":[24,47],"work,":[25],"we":[26,49,105,139],"address":[27,136],"a":[28,35,70,95,141],"novel":[29],"problem":[30],"generating":[32,203],"first-order":[33,79,116],"ambisonics,":[34],"widely":[36],"used":[37],"spatial":[38,62,194,205],"format,":[40],"directly":[41],"from":[42,118,207],"silent":[43,119],"videos.":[44],"To":[45,135],"support":[46],"task,":[48],"develop":[50],"comprehensive":[51],"evaluation":[52],"metrics":[53,195],"that":[54,114,186],"capture":[55],"both":[56],"standard":[57],"video-to-audio":[58],"generation":[59],"quality":[60],"coherence":[63],"among":[64],"multiple":[65],"channels.":[66],"We":[67],"introduce":[68],"YT-Ambigen,":[69],"dataset":[71],"comprising":[72],"102K":[73],"YouTube":[74],"video":[75,208],"clips":[76,93],"paired":[77],"with":[78,94,132,152],"ambisonics":[80,117],"tailored":[81],"generation,":[84],"expanded":[87],"version":[88],"YT-Ambigen+":[89],"containing":[90],"3x":[91],"more":[92],"rigorously":[96],"validated":[97],"high-quality":[98,204],"test":[99],"subset":[100],"19.3K":[102],"clips.":[103],"Furthermore,":[104],"present":[106],"Video-to-Spatial":[107],"Audio":[108],"Generation":[109],"(ViSAGe),":[110],"an":[111,153],"end-to-end":[112],"framework":[113],"generates":[115],"videos":[120],"by":[121],"leveraging":[122],"CLIP":[123],"features,":[124],"patchwise":[125],"energy":[126],"maps,":[127],"neural":[129],"codecs":[131],"rotation":[133],"augmentation.":[134],"efficiency":[137],"challenges,":[138],"propose":[140],"variant":[142],"coined":[143],"ViSAGe-SC":[144,170],"(Single":[145],"Codebook),":[146],"which":[147],"replaces":[148],"residual":[150],"codebooks":[151],"optimized":[154],"single":[155],"codebook":[156],"approach,":[157],"achieving":[158],"4x":[159],"faster":[160,164],"training":[161],"5x":[163],"inference":[165],"while":[166],"maintaining":[167],"superior":[168],"performance.":[169],"incorporates":[171],"heterogeneous":[172],"codec":[173],"chaining":[174],"postprocessing":[176],"candidate":[178],"reranking":[179],"inference-time":[181],"refinement.":[182],"Experimental":[183],"results":[184],"demonstrate":[185],"our":[187],"approach":[188],"outperforms":[189],"several":[190],"V2A":[191],"models":[192],"across":[193],"displays":[197],"competitive":[198],"performance":[199],"in":[200],"semantic":[201],"quality,":[202],"input.":[209]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2026-03-10T00:00:00"}
