{"id":"https://openalex.org/W4410660786","doi":"https://doi.org/10.1109/access.2025.3572954","title":"VT2Music: A Multimodal Framework for Text-Visual Guided Music Generation and Comprehensive Performance Analysis","display_name":"VT2Music: A Multimodal Framework for Text-Visual Guided Music Generation and Comprehensive Performance Analysis","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4410660786","doi":"https://doi.org/10.1109/access.2025.3572954"},"language":"en","primary_location":{"id":"doi:10.1109/access.2025.3572954","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3572954","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2025.3572954","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087251080","display_name":"Jiaxiang Zheng","orcid":"https://orcid.org/0009-0000-1407-8894"},"institutions":[{"id":"https://openalex.org/I165507594","display_name":"Kangwon National University","ror":"https://ror.org/01mh5ph17","country_code":"KR","type":"education","lineage":["https://openalex.org/I165507594"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Jiaxiang Zheng","raw_affiliation_strings":["Department of Global Cultural Convergence, Graduate School, Kangwon National University, Chuncheon, Gangwon-do, South Korea"],"raw_orcid":"https://orcid.org/0009-0000-1407-8894","affiliations":[{"raw_affiliation_string":"Department of Global Cultural Convergence, Graduate School, Kangwon National University, Chuncheon, Gangwon-do, South Korea","institution_ids":["https://openalex.org/I165507594"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050958406","display_name":"Moxi Cao","orcid":"https://orcid.org/0009-0000-2769-2316"},"institutions":[{"id":"https://openalex.org/I165507594","display_name":"Kangwon National University","ror":"https://ror.org/01mh5ph17","country_code":"KR","type":"education","lineage":["https://openalex.org/I165507594"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Moxi Cao","raw_affiliation_strings":["Department of Global Cultural Convergence, Graduate School, Kangwon National University, Chuncheon, Gangwon-do, South Korea"],"raw_orcid":"https://orcid.org/0009-0000-2769-2316","affiliations":[{"raw_affiliation_string":"Department of Global Cultural Convergence, Graduate School, Kangwon National University, Chuncheon, Gangwon-do, South Korea","institution_ids":["https://openalex.org/I165507594"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019983629","display_name":"Chongbin Zhang","orcid":"https://orcid.org/0009-0004-5214-9840"},"institutions":[{"id":"https://openalex.org/I4210138978","display_name":"Nanjing University of the Arts","ror":"https://ror.org/0457pxg28","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210138978"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chongbin Zhang","raw_affiliation_strings":["School of Modern Music and Technology, Nanjing University of the Arts, Nanjing, Jiangsu, China","School of Modern Music and Technology, Nanjing University of the Arts, 74 Beijing West Road, Nanjing, Jiangsu Province, China"],"raw_orcid":"https://orcid.org/0009-0004-5214-9840","affiliations":[{"raw_affiliation_string":"School of Modern Music and Technology, Nanjing University of the Arts, Nanjing, Jiangsu, China","institution_ids":["https://openalex.org/I4210138978"]},{"raw_affiliation_string":"School of Modern Music and Technology, Nanjing University of the Arts, 74 Beijing West Road, Nanjing, Jiangsu Province, China","institution_ids":["https://openalex.org/I4210138978"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5087251080"],"corresponding_institution_ids":["https://openalex.org/I165507594"],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.11565945,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"13","issue":null,"first_page":"92641","last_page":"92662"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9793000221252441,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9641000032424927,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7546664476394653},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.45209792256355286},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.39316314458847046},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.380718469619751}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7546664476394653},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45209792256355286},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.39316314458847046},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.380718469619751}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/access.2025.3572954","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3572954","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:f30f9090959c42378740c03b4cd122d1","is_oa":true,"landing_page_url":"https://doaj.org/article/f30f9090959c42378740c03b4cd122d1","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Access, Vol 13, Pp 92641-92662 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1109/access.2025.3572954","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3572954","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.5400000214576721,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":64,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W1999536255","https://openalex.org/W2002055708","https://openalex.org/W2006244727","https://openalex.org/W2015959845","https://openalex.org/W2036116435","https://openalex.org/W2052170734","https://openalex.org/W2060437114","https://openalex.org/W2062985968","https://openalex.org/W2064675550","https://openalex.org/W2765811365","https://openalex.org/W2772474126","https://openalex.org/W2910720953","https://openalex.org/W3092879656","https://openalex.org/W3094502228","https://openalex.org/W3108240585","https://openalex.org/W3156892778","https://openalex.org/W3176641147","https://openalex.org/W3204221554","https://openalex.org/W3207290297","https://openalex.org/W3215615641","https://openalex.org/W4214612132","https://openalex.org/W4252337780","https://openalex.org/W4297841924","https://openalex.org/W4312814772","https://openalex.org/W4312933868","https://openalex.org/W4372266552","https://openalex.org/W4381786045","https://openalex.org/W4385245566","https://openalex.org/W4385585031","https://openalex.org/W4386071707","https://openalex.org/W4388891078","https://openalex.org/W4390873340","https://openalex.org/W4392903114","https://openalex.org/W4393148499","https://openalex.org/W4401043564","https://openalex.org/W4401110409","https://openalex.org/W4402987512","https://openalex.org/W6611478984","https://openalex.org/W6712079358","https://openalex.org/W6746596988","https://openalex.org/W6749463158","https://openalex.org/W6756746141","https://openalex.org/W6757220786","https://openalex.org/W6760601182","https://openalex.org/W6776218486","https://openalex.org/W6779823529","https://openalex.org/W6787185846","https://openalex.org/W6797359156","https://openalex.org/W6802987763","https://openalex.org/W6841982715","https://openalex.org/W6845281891","https://openalex.org/W6846234567","https://openalex.org/W6847076894","https://openalex.org/W6848697212","https://openalex.org/W6849105126","https://openalex.org/W6849109464","https://openalex.org/W6849416043","https://openalex.org/W6849517043","https://openalex.org/W6849635556","https://openalex.org/W6853096648","https://openalex.org/W6861353174","https://openalex.org/W6872828326","https://openalex.org/W7019574941"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Recent":[0],"years":[1],"have":[2],"witnessed":[3],"significant":[4],"advances":[5],"in":[6,50],"text-to-music":[7],"generation":[8,55,85,110,161],"technology":[9],"through":[10],"deep":[11],"learning":[12],"approaches,":[13],"particularly":[14],"using":[15],"latent":[16],"diffusion":[17,89],"models":[18,162],"(LDM),":[19],"yet":[20],"there":[21],"remains":[22],"a":[23,82],"notable":[24],"absence":[25],"of":[26,34,93,151,176],"artificial":[27],"intelligence":[28],"(AI)":[29],"music":[30,36,54,84,97,109,141,160,178],"composition":[31],"systems":[32],"capable":[33,92],"generating":[35,94],"from":[37,56,98,111],"other":[38],"modalities.":[39],"Given":[40],"the":[41,146,152,174,186],"intricate":[42],"relationship":[43],"between":[44],"visual":[45,101,188],"perception":[46],"and":[47,66,100,132,148,191],"auditory":[48],"experience":[49],"human":[51],"cognition,":[52],"exploring":[53],"multimodal":[57,83,122,177],"data":[58],"holds":[59],"considerable":[60],"promise":[61],"for":[62],"creating":[63],"more":[64],"diverse":[65],"enriched":[67],"musical":[68,192],"experiences.":[69],"To":[70],"address":[71],"this":[72],"research":[73],"gap,":[74],"we":[75,135],"propose":[76],"VT2Music":[77,138],"(various":[78],"things":[79],"to":[80,157],"music),":[81],"model":[86],"based":[87],"on":[88],"transformers":[90],"(DiT),":[91],"semantically":[95],"aligned":[96],"textual":[99],"modality":[102],"data.":[103],"Our":[104],"framework":[105],"not":[106],"only":[107],"supports":[108],"single":[112],"modalities":[113],"(text,":[114],"image,":[115],"or":[116,127],"video),":[117],"but":[118],"also":[119],"accepts":[120],"combined":[121],"inputs":[123],"(such":[124],"as":[125],"text+image":[126],"text+video).":[128],"Through":[129],"both":[130],"objective":[131],"subjective":[133],"evaluations,":[134],"demonstrate":[136],"that":[137,142],"can":[139],"generate":[140],"reasonably":[143],"aligns":[144],"with":[145,180],"semantic":[147],"emotional":[149],"content":[150],"input,":[153],"achieving":[154],"performance":[155],"comparable":[156],"current":[158],"mainstream":[159],"across":[163],"multiple":[164],"assessment":[165],"metrics.":[166],"This":[167],"study":[168],"represents":[169],"an":[170],"initial":[171],"exploration":[172],"into":[173],"possibilities":[175],"generation,":[179],"future":[181],"work":[182],"aimed":[183],"at":[184],"enhancing":[185],"model\u2019s":[187],"feature":[189],"comprehension":[190],"naturalness.":[193]},"counts_by_year":[],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2025-10-10T00:00:00"}
