{"id":"https://openalex.org/W4414359277","doi":"https://doi.org/10.24963/ijcai.2025/1126","title":"QA-MDT: Quality-aware Masked Diffusion Transformer for Enhanced Music Generation","display_name":"QA-MDT: Quality-aware Masked Diffusion Transformer for Enhanced Music Generation","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414359277","doi":"https://doi.org/10.24963/ijcai.2025/1126"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/1126","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/1126","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100429537","display_name":"Chang Li","orcid":"https://orcid.org/0000-0003-0195-1003"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":true,"raw_author_name":"Chang Li","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023016730","display_name":"Ruoyu Wang","orcid":"https://orcid.org/0000-0003-2172-3462"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Ruoyu Wang","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080023722","display_name":"Lijuan Liu","orcid":"https://orcid.org/0009-0005-2617-5858"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Lijuan Liu","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102708330","display_name":"Jun Du","orcid":"https://orcid.org/0000-0002-3387-2724"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Jun Du","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102607853","display_name":"Yixuan Sun","orcid":"https://orcid.org/0009-0002-3199-4143"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Yixuan Sun","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109725543","display_name":"Zilu Guo","orcid":"https://orcid.org/0009-0000-9011-1756"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Zilu Guo","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101474835","display_name":"Zhengrong Zhang","orcid":"https://orcid.org/0000-0002-0608-8520"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Zhengrong Zhang","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062592774","display_name":"Yuan Jiang","orcid":"https://orcid.org/0000-0002-4542-9578"},"institutions":[{"id":"https://openalex.org/I91090688","display_name":"University of Science and Technology Chittagong","ror":"https://ror.org/00w9tx359","country_code":"BD","type":"education","lineage":["https://openalex.org/I91090688"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Yuan Jiang","raw_affiliation_strings":["USTC"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"USTC","institution_ids":["https://openalex.org/I91090688"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020540834","display_name":"Jianqing Gao","orcid":"https://orcid.org/0000-0001-5575-4940"},"institutions":[{"id":"https://openalex.org/I4210133256","display_name":"Instituttet for Anvendt Datateknik (Denmark)","ror":"https://ror.org/02q83se13","country_code":"DK","type":"company","lineage":["https://openalex.org/I4210133256"]}],"countries":["DK"],"is_corresponding":false,"raw_author_name":"Jianqing Gao","raw_affiliation_strings":["iFlytek AI Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iFlytek AI Research","institution_ids":["https://openalex.org/I4210133256"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079840202","display_name":"Feng Ma","orcid":"https://orcid.org/0009-0004-5289-8075"},"institutions":[{"id":"https://openalex.org/I4210133256","display_name":"Instituttet for Anvendt Datateknik (Denmark)","ror":"https://ror.org/02q83se13","country_code":"DK","type":"company","lineage":["https://openalex.org/I4210133256"]}],"countries":["DK"],"is_corresponding":false,"raw_author_name":"Feng Ma","raw_affiliation_strings":["iFlytek AI Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"iFlytek AI Research","institution_ids":["https://openalex.org/I4210133256"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5100429537"],"corresponding_institution_ids":["https://openalex.org/I91090688"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.21890725,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10135","last_page":"10143"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5956000089645386},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.41100001335144043},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.390500009059906},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.3815999925136566},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.37059998512268066}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7257000207901001},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5956000089645386},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4390999972820282},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.41100001335144043},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.390500009059906},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.3815999925136566},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.37059998512268066},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3598000109195709},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.32899999618530273},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3068000078201294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3059999942779541},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/1126","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/1126","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Text-to-music":[0],"(TTM)":[1],"generation,":[2],"which":[3,28],"converts":[4],"textual":[5],"descriptions":[6],"into":[7],"audio,":[8],"opens":[9],"up":[10],"innovative":[11],"avenues":[12],"for":[13,67,98,105],"multimedia":[14],"creation.":[15],"Achieving":[16],"high":[17],"quality":[18,106],"and":[19,45,90,108,134,141,152],"diversity":[20],"in":[21,32,81],"this":[22],"process":[23],"demands":[24],"extensive,":[25],"high-quality":[26],"data,":[27],"are":[29,147,155],"often":[30],"scarce":[31],"available":[33,148],"datasets.":[34,75],"Most":[35],"open-source":[36],"datasets":[37,131],"frequently":[38],"suffer":[39],"from":[40,72],"issues":[41],"like":[42],"low-quality":[43,121],"waveforms":[44],"low":[46],"text-audio":[47],"consistency,":[48],"hindering":[49],"the":[50,82,99,135],"advancement":[51],"of":[52,85],"music":[53,71],"generation":[54],"models.":[55],"To":[56],"address":[57,120],"these":[58],"challenges,":[59],"we":[60,88,112],"propose":[61],"a":[62,92,114],"novel":[63],"quality-aware":[64],"training":[65],"paradigm":[66],"generating":[68],"high-quality,":[69],"high-musicality":[70],"large-scale,":[73],"quality-imbalanced":[74],"Additionally,":[76],"by":[77],"leveraging":[78],"unique":[79],"properties":[80],"latent":[83],"space":[84],"musical":[86],"signals,":[87],"adapt":[89],"implement":[91],"masked":[93],"diffusion":[94],"transformer":[95],"(MDT)":[96],"model":[97],"TTM":[100],"task,":[101],"showcasing":[102],"its":[103],"capacity":[104],"control":[107],"enhanced":[109],"musicality.":[110],"Furthermore,":[111],"introduce":[113],"three-stage":[115],"caption":[116],"refinement":[117],"approach":[118],"to":[119],"captions'":[122],"issue.":[123],"Experiments":[124],"show":[125],"state-of-the-art":[126],"(SOTA)":[127],"performance":[128],"on":[129],"benchmark":[130],"including":[132],"MusicCaps":[133],"Song-Describer":[136],"Dataset":[137],"with":[138],"both":[139],"objective":[140],"subjective":[142],"metrics.":[143],"Demo":[144],"audio":[145],"samples":[146],"at":[149,157],"https://qa-mdt.github.io/,":[150],"code":[151],"pretrained":[153],"checkpoints":[154],"open-sourced":[156],"https://github.com/ivcylc/OpenMusic.":[158]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
