{"id":"https://openalex.org/W7148594802","doi":"https://doi.org/10.1109/asru65441.2025.11434644","title":"DiffRhythm+: Controllable and Flexible Full-Length Song Generation with Preference Optimization","display_name":"DiffRhythm+: Controllable and Flexible Full-Length Song Generation with Preference Optimization","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148594802","doi":"https://doi.org/10.1109/asru65441.2025.11434644"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434644","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434644","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132826339","display_name":"Huakang Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Huakang Chen","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132798840","display_name":"Yuepeng Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yuepeng Jiang","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132800136","display_name":"Guobin Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Guobin Ma","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103427978","display_name":"C.X. Hao","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chunbo Hao","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059659225","display_name":"Shun Wang","orcid":"https://orcid.org/0009-0003-0210-4428"},"institutions":[{"id":"https://openalex.org/I308837","display_name":"Suzhou University of Science and Technology","ror":"https://ror.org/04en8wb91","country_code":"CN","type":"education","lineage":["https://openalex.org/I308837"]},{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Wang","raw_affiliation_strings":["Nanjing University,School of Intelligence Science and Technology,Suzhou,China"],"affiliations":[{"raw_affiliation_string":"Nanjing University,School of Intelligence Science and Technology,Suzhou,China","institution_ids":["https://openalex.org/I308837","https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132827523","display_name":"Jixun Yao","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jixun Yao","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081164682","display_name":"Ziqian Ning","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ziqian Ning","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132825711","display_name":"Meng Meng","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Meng","raw_affiliation_strings":["MilM Plus Xiaomi Inc"],"affiliations":[{"raw_affiliation_string":"MilM Plus Xiaomi Inc","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132790143","display_name":"Jian Luan","orcid":null},"institutions":[{"id":"https://openalex.org/I862669128","display_name":"Xiaomi (China)","ror":"https://ror.org/029f7bn57","country_code":"CN","type":"company","lineage":["https://openalex.org/I862669128"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Luan","raw_affiliation_strings":["MilM Plus Xiaomi Inc"],"affiliations":[{"raw_affiliation_string":"MilM Plus Xiaomi Inc","institution_ids":["https://openalex.org/I862669128"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132806191","display_name":"Lei Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I2800643029","display_name":"Association for Symbolic Logic","ror":"https://ror.org/00baryp08","country_code":"US","type":"other","lineage":["https://openalex.org/I2800643029"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP@NPU)"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP@NPU)","institution_ids":["https://openalex.org/I2800643029"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5132826339"],"corresponding_institution_ids":["https://openalex.org/I2800643029"],"apc_list":null,"apc_paid":null,"fwci":2.2732,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.90819981,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.5777000188827515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.5777000188827515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.14869999885559082,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.093299999833107,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.578499972820282},{"id":"https://openalex.org/keywords/melody","display_name":"Melody","score":0.5063999891281128},{"id":"https://openalex.org/keywords/controllability","display_name":"Controllability","score":0.501800000667572},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.4925999939441681},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4503999948501587},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.44769999384880066},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.44119998812675476},{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.42570000886917114},{"id":"https://openalex.org/keywords/repetition","display_name":"Repetition (rhetorical device)","score":0.41190001368522644}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6757000088691711},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.578499972820282},{"id":"https://openalex.org/C43803900","wikidata":"https://www.wikidata.org/wiki/Q170412","display_name":"Melody","level":3,"score":0.5063999891281128},{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.501800000667572},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.4925999939441681},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4503999948501587},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.44769999384880066},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.42570000886917114},{"id":"https://openalex.org/C2776141515","wikidata":"https://www.wikidata.org/wiki/Q1274479","display_name":"Repetition (rhetorical device)","level":2,"score":0.41190001368522644},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40860000252723694},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3977999985218048},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.3675999939441681},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.36239999532699585},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.36010000109672546},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.3546000123023987},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.3450999855995178},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3398999869823456},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3206999897956848},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.31139999628067017},{"id":"https://openalex.org/C2777413886","wikidata":"https://www.wikidata.org/wiki/Q3276013","display_name":"Fluency","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C2776436406","wikidata":"https://www.wikidata.org/wiki/Q602446","display_name":"Lyrics","level":2,"score":0.2533999979496002},{"id":"https://openalex.org/C88639978","wikidata":"https://www.wikidata.org/wiki/Q233861","display_name":"Musical notation","level":3,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434644","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434644","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7881514430046082}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2990356585","https://openalex.org/W2991464959","https://openalex.org/W3031000691","https://openalex.org/W3158762648","https://openalex.org/W3207340675","https://openalex.org/W4391924849","https://openalex.org/W4392903114","https://openalex.org/W4398226295","https://openalex.org/W4401043564","https://openalex.org/W4402111425","https://openalex.org/W4402112429","https://openalex.org/W4402671802","https://openalex.org/W4402905495","https://openalex.org/W4403780831","https://openalex.org/W4409363101","https://openalex.org/W4412888825","https://openalex.org/W4413472064","https://openalex.org/W4415796994","https://openalex.org/W7133208539","https://openalex.org/W7133222008","https://openalex.org/W7133229781"],"related_works":[],"abstract_inverted_index":{"Songs,":[0],"as":[1,123],"a":[2,50,112,143],"central":[3],"form":[4],"of":[5,11,127,134],"musical":[6,47,81,136,153],"art,":[7],"exemplify":[8],"the":[9,55,132,180],"richness":[10],"human":[12],"intelligence":[13],"and":[14,45,64,77,88,105,115,125,138,159,167,201],"creativity.":[15],"While":[16],"recent":[17],"advances":[18],"in":[19,26,84,197],"generative":[20],"modeling":[21],"have":[22],"enabled":[23],"notable":[24],"progress":[25],"long-form":[27],"song":[28,34,108],"generation,":[29],"current":[30],"systems":[31],"for":[32,103],"fulllength":[33],"synthesis":[35],"still":[36],"face":[37],"major":[38],"challenges,":[39],"including":[40],"data":[41],"imbalance,":[42],"insufficient":[43],"controllability,":[44],"inconsistent":[46],"quality.":[48],"DiffRhythm,":[49],"pioneering":[51],"diffusion-based":[52],"model,":[53],"advanced":[54],"field":[56],"by":[57,71],"generating":[58],"full-length":[59,107],"songs":[60],"with":[61,176],"expressive":[62],"vocals":[63],"accompaniment.":[65],"However,":[66],"its":[67],"performance":[68,173],"was":[69],"constrained":[70],"an":[72,99],"unbalanced":[73],"model":[74,181],"training":[75,117],"dataset":[76,118],"limited":[78],"controllability":[79],"over":[80,204],"style,":[82],"resulting":[83],"noticeable":[85],"quality":[86],"disparities":[87],"restricted":[89],"creative":[90,165],"flexibility.":[91],"To":[92],"address":[93],"these":[94],"limitations,":[95],"we":[96],"propose":[97],"DiffRhythm+,":[98],"enhanced":[100],"diffusionbased":[101],"framework":[102,141],"controllable":[104],"flexible":[106],"generation.":[109],"DiffRhythm+":[110,193],"leverages":[111],"substantially":[113],"expanded":[114],"balanced":[116],"to":[119,150],"mitigate":[120],"issues":[121],"such":[122],"repetition":[124],"omission":[126],"lyrics,":[128],"while":[129],"also":[130],"fostering":[131],"emergence":[133],"richer":[135],"skills":[137],"expressiveness.":[139],"The":[140],"introduces":[142],"multi-modal":[144],"style":[145],"conditioning":[146],"strategy,":[147],"enabling":[148],"users":[149],"precisely":[151],"specify":[152],"styles":[154],"through":[155],"both":[156],"descriptive":[157],"text":[158],"reference":[160],"audio,":[161],"thereby":[162],"significantly":[163],"enhancing":[164],"control":[166],"diversity.":[168],"We":[169],"further":[170],"introduce":[171],"direct":[172],"optimization":[174],"aligned":[175],"user":[177],"preferences,":[178],"guiding":[179],"toward":[182],"consistently":[183],"preferred":[184],"outputs":[185],"across":[186],"evaluation":[187],"metrics.":[188],"Extensive":[189],"experiments":[190],"demonstrate":[191],"that":[192],"achieves":[194],"significant":[195],"improvements":[196],"naturalness,":[198],"arrangement":[199],"complexity,":[200],"listener":[202],"satisfaction":[203],"previous":[205],"systems.":[206],"Audio":[207],"samples":[208],"are":[209],"available":[210],"at":[211],"https://longwaytog0.github.io/DiffRhythmPlus/.":[212]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2026-04-03T00:00:00"}
