{"id":"https://openalex.org/W4415536516","doi":"https://doi.org/10.1145/3746027.3755523","title":"Spatial-Temporal Decomposition and Alignment in Controllable Video-to-Music Generation","display_name":"Spatial-Temporal Decomposition and Alignment in Controllable Video-to-Music Generation","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415536516","doi":"https://doi.org/10.1145/3746027.3755523"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755523","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066710323","display_name":"Weitao You","orcid":"https://orcid.org/0000-0002-9625-5547"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Weitao You","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-9625-5547","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092767967","display_name":"Heda Zuo","orcid":"https://orcid.org/0009-0005-7999-2317"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Heda Zuo","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-7999-2317","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004500750","display_name":"Junxian Wu","orcid":"https://orcid.org/0009-0005-8447-2121"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junxian Wu","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-8447-2121","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076060827","display_name":"Dengming Zhang","orcid":"https://orcid.org/0000-0002-6307-7692"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dengming Zhang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-6307-7692","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101454180","display_name":"Zhibin Zhou","orcid":"https://orcid.org/0000-0001-9545-3763"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Zhibin Zhou","raw_affiliation_strings":["Hong Kong Polytechnic University, Hong Kong, China"],"raw_orcid":"https://orcid.org/0000-0001-9545-3763","affiliations":[{"raw_affiliation_string":"Hong Kong Polytechnic University, Hong Kong, China","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036387698","display_name":"Lingyun Sun","orcid":"https://orcid.org/0000-0002-5561-0493"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingyun Sun","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-5561-0493","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5066710323"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28373202,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10278","last_page":"10286"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.9886000156402588,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/controllability","display_name":"Controllability","score":0.9562000036239624},{"id":"https://openalex.org/keywords/decomposition","display_name":"Decomposition","score":0.6798999905586243},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5792999863624573},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5284000039100647},{"id":"https://openalex.org/keywords/transformation","display_name":"Transformation (genetics)","score":0.46880000829696655},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4471000134944916}],"concepts":[{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.9562000036239624},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.6798999905586243},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6514999866485596},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5792999863624573},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5311999917030334},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5284000039100647},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.46880000829696655},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4471000134944916},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3433000147342682},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32710000872612},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2976999878883362},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C133731056","wikidata":"https://www.wikidata.org/wiki/Q4917288","display_name":"Control engineering","level":1,"score":0.27390000224113464},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25949999690055847},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755523","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W2063948594","https://openalex.org/W2095380808","https://openalex.org/W2972478942","https://openalex.org/W3132455321","https://openalex.org/W3207290297","https://openalex.org/W4213276648","https://openalex.org/W4300861819","https://openalex.org/W4319308497","https://openalex.org/W4322769471","https://openalex.org/W4372266552","https://openalex.org/W4390873340","https://openalex.org/W4390874575","https://openalex.org/W4393343355","https://openalex.org/W4403791714"],"related_works":[],"abstract_inverted_index":{"Achieving":[0],"high-quality":[1],"output":[2],"alongside":[3],"enhanced":[4],"controllability":[5,96,117],"is":[6],"crucial":[7],"in":[8,16,59,73,111],"video-to-music":[9,61],"generation,":[10],"especially":[11],"for":[12],"optimizing":[13],"user":[14],"experience":[15],"real-life":[17],"application":[18],"scenarios.":[19],"Most":[20],"existing":[21,120],"studies":[22],"emphasize":[23],"generative":[24,113],"quality,":[25],"but":[26],"often":[27],"overlooking":[28],"the":[29,35,54,81],"vital":[30],"aspect":[31],"of":[32],"controllability.":[33],"Therefore,":[34],"generated":[36],"music":[37],"cannot":[38],"be":[39],"easily":[40],"fine-tuned":[41],"or":[42],"modified":[43],"to":[44,126],"meet":[45],"users'":[46,128],"expectations.":[47],"In":[48],"this":[49],"paper,":[50],"we":[51],"delve":[52],"into":[53],"spatial-temporal":[55],"decomposition":[56,69],"and":[57,70,76,79,87,130],"alignment":[58,86],"controllable":[60],"generation.":[62],"We":[63],"first":[64],"introduce":[65],"a":[66],"novel":[67],"video-music":[68],"transformation":[71],"approach":[72],"both":[74],"spatial":[75],"temporal":[77],"domain,":[78],"enhance":[80],"cross-modal":[82],"correspondence":[83],"through":[84],"feature":[85],"flow-matching":[88],"based":[89],"alignment.":[90],"Furthermore,":[91],"our":[92,106],"method":[93],"attains":[94],"unsupervised":[95],"during":[97],"training":[98],"via":[99],"feature-free":[100],"guidance.":[101],"Experimental":[102],"results":[103,110],"demonstrate":[104],"that":[105],"model":[107],"achieves":[108],"state-of-the-art":[109],"overall":[112],"quality.":[114],"Moreover,":[115],"its":[116],"significantly":[118],"outperforms":[119],"models,":[121],"making":[122],"it":[123],"exceptionally":[124],"well-suited":[125],"accommodate":[127],"flexible":[129],"diverse":[131],"control":[132],"requirements.":[133]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2025-10-25T00:00:00"}
