{"id":"https://openalex.org/W7108747446","doi":"https://doi.org/10.5281/zenodo.17811375","title":"Instruct-MusicGen: Unlocking Text-to-Music Editing for Music Language Models via Instruction Tuning","display_name":"Instruct-MusicGen: Unlocking Text-to-Music Editing for Music Language Models via Instruction Tuning","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108747446","doi":"https://doi.org/10.5281/zenodo.17811375"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811375","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811375","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yixiao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yixiao Zhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yukara Ikemiya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yukara Ikemiya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Woosung Choi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Woosung Choi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Naoki Murata","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Naoki Murata","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Marco Mart\u00ednez-Ram\u00edrez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marco Mart\u00ednez-Ram\u00edrez","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liwei Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liwei Lin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gus Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gus Xia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wei-Hsiang Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei-Hsiang Liao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yuki Mitsufuji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Simon Dixon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Simon Dixon","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.62860218,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.7592999935150146,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.7592999935150146,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.09790000319480896,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.01810000091791153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5735999941825867},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.569100022315979},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4602999985218048},{"id":"https://openalex.org/keywords/train","display_name":"Train","score":0.3513000011444092},{"id":"https://openalex.org/keywords/music-information-retrieval","display_name":"Music information retrieval","score":0.33230000734329224},{"id":"https://openalex.org/keywords/pop-music-automation","display_name":"Pop music automation","score":0.3116999864578247}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8445000052452087},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5735999941825867},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.569100022315979},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4602999985218048},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4544000029563904},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4309999942779541},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4025999903678894},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.3513000011444092},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.33230000734329224},{"id":"https://openalex.org/C73520026","wikidata":"https://www.wikidata.org/wiki/Q7229091","display_name":"Pop music automation","level":4,"score":0.3116999864578247},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3086000084877014},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27619999647140503},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.263700008392334},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.26260000467300415}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811375","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811375","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811375","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,32,65,189],"text-to-music":[3,178],"editing,":[4],"which":[5,48,122],"employ":[6],"text":[7,114],"queries":[8],"to":[9,41,60,89,126,148,167],"modify":[10],"music":[11,28,186,191],"(e.g.":[12],"by":[13,38,111],"changing":[14],"its":[15],"style":[16],"or":[17,98],"adjusting":[18],"instrumental":[19],"components),":[20],"present":[21],"unique":[22],"challenges":[23],"and":[24,52,73,117,130,134,153],"opportunities":[25],"for":[26,156],"AI-assisted":[27],"creation.":[29],"Previous":[30],"approaches":[31],"this":[33],"domain":[34],"have":[35],"been":[36],"constrained":[37],"the":[39,71,107,124,136,149,175,183],"necessity":[40],"train":[42],"specific":[43],"editing":[44,92,179],"models":[45,59,188],"from":[46],"scratch,":[47],"is":[49],"both":[50],"resource-intensive":[51],"inefficient;":[53],"other":[54],"research":[55],"uses":[56],"large":[57],"language":[58,187],"predict":[61],"edited":[62,138],"music,":[63],"resulting":[64],"imprecise":[66],"audio":[67,119,131],"reconstruction.":[68],"To":[69],"Combine":[70],"strengths":[72],"address":[74],"these":[75],"limitations,":[76],"we":[77],"introduce":[78],"Instruct-MusicGen,":[79],"a":[80,85,104,113],"novel":[81],"approach":[82,102],"that":[83],"finetunes":[84],"pretrained":[86],"MusicGen":[87,109,151],"model":[88,125,152],"efficiently":[90],"follow":[91],"instructions":[93],"such":[94],"as":[95],"adding,":[96],"removing,":[97],"separating":[99],"stems.":[100],"Our":[101],"involves":[103],"modification":[105],"of":[106,177,185],"original":[108,150],"architecture":[110],"incorporating":[112],"fusion":[115,120],"module":[116],"an":[118],"module,":[121],"allow":[123],"process":[127],"instruction":[128],"texts":[129],"inputs":[132],"concurrently":[133],"yield":[135],"desired":[137],"music.":[139],"Remarkably,":[140],"although":[141],"Instruct-MusicGen":[142],"only":[143,154,173],"introduces":[144],"8%":[145],"new":[146],"parameters":[147],"trains":[155],"5K":[157],"steps,":[158],"it":[159],"achieves":[160],"superior":[161],"performance":[162],"across":[163],"all":[164],"tasks":[165],"compared":[166],"existing":[168],"baselines.":[169],"This":[170],"advancement":[171],"not":[172],"enhances":[174],"efficiency":[176],"but":[180],"also":[181],"broadens":[182],"applicability":[184],"dynamic":[190],"production":[192],"environments.":[193]},"counts_by_year":[],"updated_date":"2025-12-05T23:25:22.460635","created_date":"2025-12-05T00:00:00"}
