{"id":"https://openalex.org/W7108614071","doi":"https://doi.org/10.5281/zenodo.17811355","title":"A Survey on Vision-to-Music Generation: Methods, Datasets, Evaluation, and Challenges","display_name":"A Survey on Vision-to-Music Generation: Methods, Datasets, Evaluation, and Challenges","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108614071","doi":"https://doi.org/10.5281/zenodo.17811355"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811355","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811355","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhaokai Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhaokai Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chenxi Bao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chenxi Bao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Le Zhuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Le Zhuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jingrui Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jingrui Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang Yue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Yue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yihong Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yihong Tang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Victor Shea-Jay Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Victor Shea-Jay Huang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Yue Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yue Liao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.59734002,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.43140000104904175,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.43140000104904175,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.16189999878406525,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.11990000307559967,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.6915000081062317},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.506600022315979},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3418999910354614},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.32170000672340393},{"id":"https://openalex.org/keywords/music-information-retrieval","display_name":"Music information retrieval","score":0.3021000027656555},{"id":"https://openalex.org/keywords/open-research","display_name":"Open research","score":0.2928999960422516}],"concepts":[{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.6915000081062317},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6651999950408936},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.6093000173568726},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.506600022315979},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37560001015663147},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3546999990940094},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3418999910354614},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.3021000027656555},{"id":"https://openalex.org/C2778464652","wikidata":"https://www.wikidata.org/wiki/Q309849","display_name":"Open research","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C157170001","wikidata":"https://www.wikidata.org/wiki/Q4781507","display_name":"Applications of artificial intelligence","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C2779182362","wikidata":"https://www.wikidata.org/wiki/Q17126187","display_name":"Session (web analytics)","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C2986817661","wikidata":"https://www.wikidata.org/wiki/Q185698","display_name":"Research methodology","level":3,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811355","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811355","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6486297249794006,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-to-music":[0],"Generation,":[1],"including":[2],"video-to-music":[3],"and":[4,21,41,83,95,106,125,135,149,159],"image-to-music":[5],"tasks,":[6],"is":[7,29,128],"a":[8],"significant":[9],"branch":[10],"of":[11,44,74,103,122,153],"multimodal":[12,154],"artificial":[13],"intelligence":[14],"demonstrating":[15],"vast":[16],"applications":[17],"like":[18],"film":[19],"scoring":[20],"short":[22],"video":[23],"creation.":[24],"However,":[25],"research":[26,69,158],"in":[27,31,71,146,156],"vision-to-music":[28,75,147],"still":[30],"its":[32,37],"preliminary":[33],"stage":[34],"due":[35],"to":[36],"complex":[38],"internal":[39],"structure":[40],"the":[42,68,72,80,112,116,150],"difficulty":[43],"modeling":[45],"dynamic":[46],"relationships":[47],"with":[48],"video.":[49],"Existing":[50],"surveys":[51],"focus":[52],"on":[53,60],"general":[54,90],"music":[55,105],"generation":[56,148,155],"without":[57],"comprehensive":[58],"discussion":[59],"vision-to-music.":[61],"In":[62],"this":[63],"paper,":[64],"we":[65,131],"systematically":[66],"review":[67,121],"progress":[70],"field":[73,152],"generation.":[76],"We":[77,109,138],"first":[78],"analyze":[79],"technical":[81],"characteristics":[82],"core":[84],"challenges":[85,134],"for":[86],"three":[87],"input":[88],"types:":[89],"videos,":[91,94],"human":[92],"movement":[93],"images,":[96],"as":[97,99],"well":[98],"two":[100],"output":[101],"types":[102],"symbolic":[104],"audio":[107],"music.":[108],"then":[110],"summarize":[111],"existing":[113],"methodologies":[114],"from":[115],"architecture":[117],"perspective.":[118],"A":[119],"detailed":[120],"common":[123],"datasets":[124],"evaluation":[126],"metrics":[127],"provided.":[129],"Finally,":[130],"discuss":[132],"current":[133],"future":[136],"directions.":[137],"hope":[139],"our":[140],"survey":[141],"can":[142],"inspire":[143],"further":[144],"innovation":[145],"broader":[151],"academic":[157],"industrial":[160],"applications.":[161]},"counts_by_year":[],"updated_date":"2025-12-05T23:25:22.460635","created_date":"2025-12-05T00:00:00"}
