{"id":"https://openalex.org/W4415540936","doi":"https://doi.org/10.1145/3746027.3755656","title":"Controllable Video-to-Music Generation with Multiple Time-Varying Conditions","display_name":"Controllable Video-to-Music Generation with Multiple Time-Varying Conditions","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540936","doi":"https://doi.org/10.1145/3746027.3755656"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755656","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755656","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004500750","display_name":"Junxian Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junxian Wu","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-8447-2121","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066710323","display_name":"Weitao You","orcid":"https://orcid.org/0000-0002-9625-5547"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weitao You","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-9625-5547","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092767967","display_name":"Heda Zuo","orcid":"https://orcid.org/0009-0005-7999-2317"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Heda Zuo","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-7999-2317","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076060827","display_name":"Dengming Zhang","orcid":"https://orcid.org/0000-0002-6307-7692"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dengming Zhang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-6307-7692","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058762433","display_name":"Pei Chen","orcid":"https://orcid.org/0000-0003-0962-6459"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pei Chen","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-0962-6459","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036387698","display_name":"Lingyun Sun","orcid":"https://orcid.org/0000-0002-5561-0493"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingyun Sun","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-5561-0493","affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5004500750"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37116674,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10427","last_page":"10436"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5253000259399414},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.5081999897956848},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4505999982357025},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4133000075817108},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.37630000710487366},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.3646000027656555}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7475000023841858},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5253000259399414},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.5081999897956848},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4505999982357025},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43230000138282776},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4133000075817108},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.37630000710487366},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.3646000027656555},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.32249999046325684},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2867000102996826},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.2615000009536743},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.2563000023365021}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755656","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755656","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2149628368","https://openalex.org/W2939574508","https://openalex.org/W3207290297","https://openalex.org/W4372260250","https://openalex.org/W4372260310","https://openalex.org/W4377719641","https://openalex.org/W4386057769","https://openalex.org/W4386071707","https://openalex.org/W4390873054","https://openalex.org/W4390873340","https://openalex.org/W4392647990","https://openalex.org/W4398226295"],"related_works":[],"abstract_inverted_index":{"Music":[0],"enhances":[1],"video":[2],"narratives":[3],"and":[4,75,97,120,129,149,155],"emotions,":[5],"driving":[6],"demand":[7],"for":[8,56,83],"automatic":[9],"video-to-music":[10],"(V2M)":[11],"generation.":[12,61],"However,":[13],"existing":[14,143],"V2M":[15,48,73,144],"methods":[16],"relying":[17],"solely":[18],"on":[19],"visual":[20],"features":[21],"or":[22],"supplementary":[23],"textual":[24],"inputs":[25],"generate":[26],"music":[27,60,133],"in":[28,146],"a":[29,44,65,92,98,115,121],"black-box":[30],"manner,":[31],"often":[32],"failing":[33],"to":[34,104,125],"meet":[35],"user":[36,158],"expectations.":[37,159],"To":[38],"address":[39],"this":[40],"challenge,":[41],"we":[42,90,113],"propose":[43],"novel":[45],"multi-condition":[46,84],"guided":[47],"generation":[49],"framework":[50],"that":[51,69,139],"incorporates":[52],"multiple":[53,127],"time-varying":[54],"conditions":[55,128],"enhanced":[57],"control":[58,154],"over":[59],"Our":[62],"method":[63,141],"uses":[64],"two-stage":[66],"training":[67],"strategy":[68],"enables":[70],"learning":[71],"of":[72],"fundamentals":[74],"audiovisual":[76],"temporal":[77,100],"synchronization":[78],"while":[79],"meeting":[80],"users'":[81],"needs":[82],"control.":[85],"In":[86],"the":[87,110,132],"first":[88],"stage,":[89,112],"introduce":[91],"fine-grained":[93],"feature":[94,107],"selection":[95],"module":[96,119,124],"progressive":[99],"alignment":[101,156],"attention":[102],"mechanism":[103],"ensure":[105],"flexible":[106],"alignment.":[108],"For":[109],"second":[111],"develop":[114],"dynamic":[116],"conditional":[117],"fusion":[118],"control-guided":[122],"decoder":[123],"integrate":[126],"accurately":[130],"guide":[131],"composition":[134],"process.":[135],"Extensive":[136],"experiments":[137],"demonstrate":[138],"our":[140],"outperforms":[142],"pipelines":[145],"both":[147],"subjective":[148],"objective":[150],"evaluations,":[151],"significantly":[152],"enhancing":[153],"with":[157]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-25T00:00:00"}
