{"id":"https://openalex.org/W4415709700","doi":"https://doi.org/10.1109/icme59968.2025.11208972","title":"Pop-Diffuseq: Controllable Symbolic Music Multi-Instrument Infilling and Accompaniment Generation with Long-Axis Attention","display_name":"Pop-Diffuseq: Controllable Symbolic Music Multi-Instrument Infilling and Accompaniment Generation with Long-Axis Attention","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415709700","doi":"https://doi.org/10.1109/icme59968.2025.11208972"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11208972","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11208972","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073120743","display_name":"Yi Zou","orcid":"https://orcid.org/0000-0003-3470-5781"},"institutions":[{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yi Zou","raw_affiliation_strings":["Communication University of China,School of Information and Communication Engineering,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Communication University of China,School of Information and Communication Engineering,Beijing,China","institution_ids":["https://openalex.org/I75689368"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065017752","display_name":"Haonan Cheng","orcid":"https://orcid.org/0000-0003-3407-4318"},"institutions":[{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]},{"id":"https://openalex.org/I4391768176","display_name":"State Key Laboratory of Media Convergence and Communication","ror":"https://ror.org/0595ys057","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391768176","https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haonan Cheng","raw_affiliation_strings":["Communication University of China,State Key Laboratory of Media Convergence and Communication,Beijing,China,100024"],"affiliations":[{"raw_affiliation_string":"Communication University of China,State Key Laboratory of Media Convergence and Communication,Beijing,China,100024","institution_ids":["https://openalex.org/I75689368","https://openalex.org/I4391768176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100600271","display_name":"Long Ye","orcid":"https://orcid.org/0000-0002-3562-5612"},"institutions":[{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Ye","raw_affiliation_strings":["Communication University of China,School of Data Science and Intelligent Media,Beijing,China,100024"],"affiliations":[{"raw_affiliation_string":"Communication University of China,School of Data Science and Intelligent Media,Beijing,China,100024","institution_ids":["https://openalex.org/I75689368"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5114378014","display_name":"Qin Zhang","orcid":"https://orcid.org/0009-0001-0205-6986"},"institutions":[{"id":"https://openalex.org/I4210110997","display_name":"Ministry of Education","ror":"https://ror.org/01xexqx38","country_code":"ME","type":"government","lineage":["https://openalex.org/I4210110997"]}],"countries":["ME"],"is_corresponding":false,"raw_author_name":"Qin Zhang","raw_affiliation_strings":["Key Laboratory of Media Audio &amp; Video, (Communication University of China),Ministry of Education,Beijing,China,100024"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Media Audio &amp; Video, (Communication University of China),Ministry of Education,Beijing,China,100024","institution_ids":["https://openalex.org/I4210110997"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5073120743"],"corresponding_institution_ids":["https://openalex.org/I75689368"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.39433003,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.753000020980835,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.753000020980835,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.22380000352859497,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10788","display_name":"Neuroscience and Music Perception","score":0.003700000001117587,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/controllability","display_name":"Controllability","score":0.5975000262260437},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4564000070095062},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.42500001192092896},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4020000100135803},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.38609999418258667},{"id":"https://openalex.org/keywords/mood","display_name":"Mood","score":0.321399986743927}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6492000222206116},{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.5975000262260437},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4867999851703644},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4564000070095062},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4020000100135803},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.38609999418258667},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.351500004529953},{"id":"https://openalex.org/C2780733359","wikidata":"https://www.wikidata.org/wiki/Q331769","display_name":"Mood","level":2,"score":0.321399986743927},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.31459999084472656},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3100999891757965},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2621999979019165}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11208972","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11208972","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2772474126","https://openalex.org/W2898827701","https://openalex.org/W2964110616","https://openalex.org/W3092879656","https://openalex.org/W3173187964","https://openalex.org/W3175663427","https://openalex.org/W4382239690","https://openalex.org/W4385245566","https://openalex.org/W4385763800","https://openalex.org/W4389518872","https://openalex.org/W4389519533","https://openalex.org/W4401044003"],"related_works":[],"abstract_inverted_index":{"Controllability":[0],"is":[1,140],"a":[2,83,104],"major":[3],"challenge":[4],"in":[5,27],"music":[6,64],"infilling":[7],"and":[8,30,61,111,129],"accompaniment":[9],"tasks.":[10],"Solutions":[11],"based":[12],"on":[13,40,119],"transformer":[14],"decoders":[15],"have":[16],"been":[17],"widely":[18],"adopted,":[19],"while":[20,66],"data-driven":[21],"approaches":[22],"with":[23,90,108],"full":[24],"self-attention":[25],"result":[26],"high":[28],"costs":[29],"unsatisfied":[31],"outcomes":[32],"for":[33,76],"fine-grained":[34],"control.":[35],"Existing":[36],"diffusion":[37,75],"methods":[38,56],"rely":[39],"trained":[41],"classifiers,":[42],"unconditional":[43],"frameworks,":[44],"or":[45],"solo":[46],"track,":[47],"etc.":[48],"To":[49],"address":[50],"these":[51],"issues,":[52],"we":[53,71,81,102],"explore":[54],"novel":[55],"to":[57,93],"enhance":[58],"the":[59,73,95,120,146],"controllability":[60],"quality":[62],"of":[63,98,137],"model":[65,139],"reducing":[67],"computational":[68],"complexity.":[69],"Firstly,":[70],"improve":[72],"classifier-free":[74],"multi-instrumental":[77],"pop":[78,105],"music.":[79],"Secondly,":[80],"design":[82],"long-axis":[84],"attention":[85,92],"algorithm":[86],"that":[87],"combines":[88],"long":[89],"axial":[91],"acquire":[94],"feature":[96],"correlations":[97],"multi-dimensional":[99],"attributes.":[100],"Additionally,":[101],"contribute":[103],"band":[106],"dataset":[107],"melody,":[109],"style":[110],"mood":[112],"labels":[113],"handcrafted":[114],"by":[115],"musicians.":[116],"After":[117],"experiments":[118],"benchmark":[121],"dataset,":[122],"our":[123,138],"method":[124],"demonstrates":[125],"high-quality":[126],"controllable":[127],"results":[128],"outperforms":[130],"existing":[131],"state-of-the-art":[132],"models.":[133],"The":[134],"GPU":[135],"memory":[136],"26.9%":[141],"lower":[142],"than":[143],"Diffuseq":[144],"under":[145],"same":[147],"hyperparameters.":[148]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-30T00:00:00"}
