{"id":"https://openalex.org/W4408345630","doi":"https://doi.org/10.1109/icassp49660.2025.10890309","title":"Editing Music with Melody and Text: Using ControlNet for Diffusion Transformer","display_name":"Editing Music with Melody and Text: Using ControlNet for Diffusion Transformer","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408345630","doi":"https://doi.org/10.1109/icassp49660.2025.10890309"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890309","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890309","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102735551","display_name":"Siyuan Hou","orcid":"https://orcid.org/0000-0002-5913-2791"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Siyuan Hou","raw_affiliation_strings":["Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066070154","display_name":"Shansong Liu","orcid":"https://orcid.org/0000-0001-6202-5615"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shansong Liu","raw_affiliation_strings":["Tencent PCG,ARC Lab"],"affiliations":[{"raw_affiliation_string":"Tencent PCG,ARC Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067150934","display_name":"Ruibin Yuan","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Ruibin Yuan","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100652844","display_name":"Wei Xue","orcid":"https://orcid.org/0000-0002-4942-7748"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Wei Xue","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101413378","display_name":"Ying Shan","orcid":"https://orcid.org/0009-0007-3607-9506"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Shan","raw_affiliation_strings":["Tencent PCG,ARC Lab"],"affiliations":[{"raw_affiliation_string":"Tencent PCG,ARC Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109200535","display_name":"Man Zhao","orcid":"https://orcid.org/0009-0006-3817-2731"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mangsuo Zhao","raw_affiliation_strings":["Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100460223","display_name":"Chao Zhang","orcid":"https://orcid.org/0000-0002-9583-0722"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Zhang","raw_affiliation_strings":["Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5102735551"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":5.1131,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.94676495,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.946399986743927,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.946399986743927,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9352999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9020000100135803,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6647696495056152},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6377647519111633},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5397208333015442},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.42622435092926025},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34997284412384033},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.21056395769119263},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10885852575302124},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.10050216317176819},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.0590568482875824}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6647696495056152},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6377647519111633},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5397208333015442},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.42622435092926025},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34997284412384033},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.21056395769119263},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10885852575302124},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.10050216317176819},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0590568482875824},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890309","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890309","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-159965","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-159965","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W2076608692","https://openalex.org/W3094550259","https://openalex.org/W4221159371","https://openalex.org/W4312933868","https://openalex.org/W4390872297","https://openalex.org/W4390873054","https://openalex.org/W4392903114","https://openalex.org/W4392904237","https://openalex.org/W4392963916","https://openalex.org/W4393158405","https://openalex.org/W4396877837","https://openalex.org/W4398226295","https://openalex.org/W4401023668","https://openalex.org/W4401110409","https://openalex.org/W4401307059","https://openalex.org/W4402671601","https://openalex.org/W4403674737","https://openalex.org/W4408345930","https://openalex.org/W4409365005","https://openalex.org/W4411528087","https://openalex.org/W4412172710","https://openalex.org/W6678969435","https://openalex.org/W6732646663","https://openalex.org/W6746836464","https://openalex.org/W6763945542","https://openalex.org/W6779823529","https://openalex.org/W6840815571","https://openalex.org/W6848578254","https://openalex.org/W6848854281","https://openalex.org/W6849105126","https://openalex.org/W6849109464","https://openalex.org/W6853096648","https://openalex.org/W6857054612","https://openalex.org/W6857775612","https://openalex.org/W6858779713","https://openalex.org/W6860268860","https://openalex.org/W6860658276","https://openalex.org/W6861353174","https://openalex.org/W6864798992","https://openalex.org/W6868889147","https://openalex.org/W6869165705","https://openalex.org/W6869810883"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Despite":[0],"the":[1,13,22,85,112,129],"significant":[2],"progress":[3],"in":[4,12,133,186],"controllable":[5],"music":[6,19,59,98],"generation":[7,60,145,177,191],"and":[8,15,27,57,61,66,72,117,146,192],"editing,":[9],"challenges":[10],"remain":[11],"quality":[14],"length":[16],"of":[17,24,106,188],"generated":[18],"due":[20],"to":[21,91],"use":[23],"Mel-spectrogram":[25],"representations":[26,93],"UNet-based":[28],"model":[29],"structures.":[30],"To":[31,109],"address":[32],"these":[33],"limitations,":[34],"we":[35,76,120],"propose":[36],"a":[37,41,78,103,122,134,162,182],"novel":[38,79],"approach":[39,168],"using":[40,51,150],"Diffusion":[42],"Transformer":[43],"(DiT)":[44],"augmented":[45],"with":[46,99],"an":[47],"additional":[48],"control":[49,113],"branch":[50],"ControlNet.":[52],"This":[53],"allows":[54],"for":[55,97,195],"long-form":[56],"variable-length":[58],"editing":[62,172],"controlled":[63],"by":[64,159],"text":[65,116],"melody":[67,74,86,118,130,193],"prompts.":[68],"For":[69],"more":[70,135],"precise":[71],"fine-grained":[73],"control,":[75],"introduce":[77],"top-k":[80],"constant-Q":[81],"Transform":[82],"representation":[83],"as":[84],"prompt,":[87,131],"reducing":[88],"ambiguity":[89],"compared":[90],"previous":[92],"(e.g.,":[94],"chroma),":[95],"particularly":[96],"multiple":[100],"tracks":[101],"or":[102],"wide":[104],"range":[105],"pitch":[107],"values.":[108],"effectively":[110],"balance":[111],"signals":[114],"from":[115],"prompts,":[119],"adopt":[121],"curriculum":[123],"learning":[124],"strategy":[125],"that":[126,158],"progressively":[127],"masks":[128],"resulting":[132],"stable":[136],"training":[137],"process.":[138],"Experiments":[139],"have":[140],"been":[141],"performed":[142],"on":[143],"text-to-music":[144,176],"music-style":[147],"transfer":[148],"tasks":[149],"open-source":[151],"instrumental":[152],"recording":[153],"data.":[154],"The":[155],"results":[156,180],"demonstrate":[157],"extending":[160],"StableAudio,":[161],"pre-trained":[163],"text-controlled":[164],"DiT":[165],"model,":[166],"our":[167],"enables":[169],"superior":[170],"melody-controlled":[171],"while":[173],"retaining":[174],"good":[175],"performance.":[178],"These":[179],"outperform":[181],"strong":[183],"MusicGen":[184],"baseline":[185],"terms":[187],"both":[189],"text-based":[190],"preservation":[194],"editing.":[196],"Audio":[197],"examples":[198],"can":[199],"be":[200],"found":[201],"at":[202],"https://stable-audio-control.github.io.":[203]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
